diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 8bb28084159e8..98fe923147ccc 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -91,64 +91,73 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass); addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass); - addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass); + + const SIRegisterInfo *TRI = STI.getRegisterInfo(); + const TargetRegisterClass *V32RegClass = + TRI->getDefaultVectorSuperClassForBitWidth(32); + addRegisterClass(MVT::f32, V32RegClass); addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass); - const SIRegisterInfo *TRI = STI.getRegisterInfo(); - const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class(); + const TargetRegisterClass *V64RegClass = + TRI->getDefaultVectorSuperClassForBitWidth(64); addRegisterClass(MVT::f64, V64RegClass); addRegisterClass(MVT::v2f32, V64RegClass); addRegisterClass(MVT::Untyped, V64RegClass); addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass); - addRegisterClass(MVT::v3f32, &AMDGPU::VReg_96RegClass); + addRegisterClass(MVT::v3f32, TRI->getDefaultVectorSuperClassForBitWidth(96)); addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass); addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass); addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass); - addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass); + addRegisterClass(MVT::v4f32, TRI->getDefaultVectorSuperClassForBitWidth(128)); addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass); - addRegisterClass(MVT::v5f32, &AMDGPU::VReg_160RegClass); + addRegisterClass(MVT::v5f32, TRI->getDefaultVectorSuperClassForBitWidth(160)); addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass); - addRegisterClass(MVT::v6f32, &AMDGPU::VReg_192RegClass); + addRegisterClass(MVT::v6f32, TRI->getDefaultVectorSuperClassForBitWidth(192)); addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass); - addRegisterClass(MVT::v3f64, &AMDGPU::VReg_192RegClass); + addRegisterClass(MVT::v3f64, TRI->getDefaultVectorSuperClassForBitWidth(192)); addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass); - addRegisterClass(MVT::v7f32, &AMDGPU::VReg_224RegClass); + addRegisterClass(MVT::v7f32, TRI->getDefaultVectorSuperClassForBitWidth(224)); addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass); - addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass); + addRegisterClass(MVT::v8f32, TRI->getDefaultVectorSuperClassForBitWidth(256)); addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass); - addRegisterClass(MVT::v4f64, &AMDGPU::VReg_256RegClass); + addRegisterClass(MVT::v4f64, TRI->getDefaultVectorSuperClassForBitWidth(256)); addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass); - addRegisterClass(MVT::v9f32, &AMDGPU::VReg_288RegClass); + addRegisterClass(MVT::v9f32, TRI->getDefaultVectorSuperClassForBitWidth(288)); addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass); - addRegisterClass(MVT::v10f32, &AMDGPU::VReg_320RegClass); + addRegisterClass(MVT::v10f32, + TRI->getDefaultVectorSuperClassForBitWidth(320)); addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass); - addRegisterClass(MVT::v11f32, &AMDGPU::VReg_352RegClass); + addRegisterClass(MVT::v11f32, + TRI->getDefaultVectorSuperClassForBitWidth(352)); addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass); - addRegisterClass(MVT::v12f32, &AMDGPU::VReg_384RegClass); + addRegisterClass(MVT::v12f32, + TRI->getDefaultVectorSuperClassForBitWidth(384)); addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass); - addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass); + addRegisterClass(MVT::v16f32, + TRI->getDefaultVectorSuperClassForBitWidth(512)); addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass); - addRegisterClass(MVT::v8f64, &AMDGPU::VReg_512RegClass); + addRegisterClass(MVT::v8f64, TRI->getDefaultVectorSuperClassForBitWidth(512)); addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass); - addRegisterClass(MVT::v16f64, &AMDGPU::VReg_1024RegClass); + addRegisterClass(MVT::v16f64, + TRI->getDefaultVectorSuperClassForBitWidth(1024)); if (Subtarget->has16BitInsts()) { if (Subtarget->useRealTrue16Insts()) { @@ -180,7 +189,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, } addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass); - addRegisterClass(MVT::v32f32, &AMDGPU::VReg_1024RegClass); + addRegisterClass(MVT::v32f32, + TRI->getDefaultVectorSuperClassForBitWidth(1024)); computeRegisterProperties(Subtarget->getRegisterInfo()); diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index a6c1af24e13e9..3f52e8229ac08 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -3557,6 +3557,17 @@ SIRegisterInfo::getVectorSuperClassForBitWidth(unsigned BitWidth) const { : getAnyVectorSuperClassForBitWidth(BitWidth); } +const TargetRegisterClass * +SIRegisterInfo::getDefaultVectorSuperClassForBitWidth(unsigned BitWidth) const { + // TODO: In principle this should use AV classes for gfx908 too. This is + // limited to 90a+ to avoid regressing special case copy optimizations which + // need new handling. The core issue is that it's not possible to directly + // copy between AGPRs on gfx908, and the current optimizations around that + // expect to see copies to VGPR. + return ST.hasGFX90AInsts() ? getVectorSuperClassForBitWidth(BitWidth) + : getVGPRClassForBitWidth(BitWidth); +} + const TargetRegisterClass * SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) { if (BitWidth == 16 || BitWidth == 32) @@ -3741,18 +3752,11 @@ bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI, unsigned DstSubReg, const TargetRegisterClass *NewRC, LiveIntervals &LIS) const { - unsigned SrcSize = getRegSizeInBits(*SrcRC); - unsigned DstSize = getRegSizeInBits(*DstRC); + // TODO: This should be more aggressive, but be more cautious with very wide + // tuples. unsigned NewSize = getRegSizeInBits(*NewRC); - - // Do not increase size of registers beyond dword, we would need to allocate - // adjacent registers and constraint regalloc more than needed. - - // Always allow dword coalescing. - if (SrcSize <= 32 || DstSize <= 32) - return true; - - return NewSize <= DstSize || NewSize <= SrcSize; + return NewSize <= 128 || NewSize <= getRegSizeInBits(*SrcRC) || + NewSize <= getRegSizeInBits(*DstRC); } unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index 7b91ba7bc581f..6e119e5e7c194 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -215,6 +215,10 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo { const TargetRegisterClass * getVectorSuperClassForBitWidth(unsigned BitWidth) const; + LLVM_READONLY + const TargetRegisterClass * + getDefaultVectorSuperClassForBitWidth(unsigned BitWidth) const; + LLVM_READONLY static const TargetRegisterClass *getSGPRClassForBitWidth(unsigned BitWidth); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll index 2351c969d5e49..9a90faf723461 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll @@ -8,17 +8,16 @@ define amdgpu_kernel void @v_mul_i64_no_zext(ptr addrspace(1) %out, ptr addrspac ; GFX10-LABEL: v_mul_i64_no_zext: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v7, s[0:1] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v7, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX10-NEXT: global_load_dwordx2 v[4:5], v6, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, v0, v2, 0 -; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, v0, v3, v[5:6] -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v1, v2, v[5:6] -; GFX10-NEXT: v_mov_b32_e32 v5, v0 -; GFX10-NEXT: global_store_dwordx2 v7, v[4:5], s[2:3] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v2, v4, 0 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, v2, v5, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, v3, v4, v[1:2] +; GFX10-NEXT: global_store_dwordx2 v6, v[0:1], s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_mul_i64_no_zext: @@ -26,19 +25,17 @@ define amdgpu_kernel void @v_mul_i64_no_zext(ptr addrspace(1) %out, ptr addrspac ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v9, 3, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b64 v[0:1], v9, s[0:1] -; GFX11-NEXT: global_load_b64 v[2:3], v9, s[2:3] +; GFX11-NEXT: global_load_b64 v[2:3], v8, s[0:1] +; GFX11-NEXT: global_load_b64 v[4:5], v8, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, v0, v2, 0 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v4, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v0, v3, v[5:6] -; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v1, v2, v[6:7] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v5, v7 -; GFX11-NEXT: global_store_b64 v9, v[4:5], s[2:3] +; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v2, v5, v[1:2] +; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v3, v4, v[6:7] +; GFX11-NEXT: global_store_b64 v8, v[0:1], s[2:3] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid @@ -58,18 +55,16 @@ define amdgpu_kernel void @v_mul_i64_zext_src1(ptr addrspace(1) %out, ptr addrsp ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 2, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] -; GFX10-NEXT: global_load_dword v4, v3, s[6:7] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v1, s[2:3] +; GFX10-NEXT: global_load_dword v4, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s2, v0, v4, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v1, v4, v[0:1] -; GFX10-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v2, v4, 0 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s2, v3, v4, v[1:2] +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_mul_i64_zext_src1: @@ -80,17 +75,17 @@ define amdgpu_kernel void @v_mul_i64_zext_src1(ptr addrspace(1) %out, ptr addrsp ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v1, s[2:3] -; GFX11-NEXT: global_load_b32 v5, v2, s[4:5] +; GFX11-NEXT: global_load_b64 v[2:3], v1, s[2:3] +; GFX11-NEXT: global_load_b32 v5, v0, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v0, v5, 0 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v5, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v1, v5, v[0:1] -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: global_store_b64 v0, v[2:3], s[0:1] +; GFX11-NEXT: v_mov_b32_e32 v4, v1 +; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v3, v5, v[4:5] +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid @@ -110,18 +105,16 @@ define amdgpu_kernel void @v_mul_i64_zext_src0(ptr addrspace(1) %out, ptr addrsp ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v2, s[2:3] -; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[6:7] +; GFX10-NEXT: global_load_dword v4, v1, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s2, v4, v0, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v4, v1, v[0:1] -; GFX10-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v4, v2, 0 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s2, v4, v3, v[1:2] +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_mul_i64_zext_src0: @@ -135,14 +128,14 @@ define amdgpu_kernel void @v_mul_i64_zext_src0(ptr addrspace(1) %out, ptr addrsp ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v5, v1, s[2:3] -; GFX11-NEXT: global_load_b64 v[0:1], v0, s[4:5] +; GFX11-NEXT: global_load_b64 v[2:3], v0, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v5, v0, 0 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v2, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v5, v1, v[0:1] -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: global_store_b64 v0, v[2:3], s[0:1] +; GFX11-NEXT: v_mov_b32_e32 v4, v1 +; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v5, v3, v[4:5] +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.a = getelementptr inbounds i32, ptr addrspace(1) %aptr, i32 %tid @@ -209,18 +202,16 @@ define amdgpu_kernel void @v_mul_i64_masked_src0_hi(ptr addrspace(1) %out, ptr a ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dword v4, v2, s[2:3] -; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] +; GFX10-NEXT: global_load_dword v4, v0, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s2, v4, v0, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v4, v1, v[0:1] -; GFX10-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v4, v2, 0 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s2, v4, v3, v[1:2] +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_mul_i64_masked_src0_hi: @@ -234,14 +225,14 @@ define amdgpu_kernel void @v_mul_i64_masked_src0_hi(ptr addrspace(1) %out, ptr a ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b32 v5, v0, s[2:3] -; GFX11-NEXT: global_load_b64 v[0:1], v0, s[4:5] +; GFX11-NEXT: global_load_b64 v[2:3], v0, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v5, v0, 0 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v2, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v5, v1, v[0:1] -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: global_store_b64 v0, v[2:3], s[0:1] +; GFX11-NEXT: v_mov_b32_e32 v4, v1 +; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v5, v3, v[4:5] +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid @@ -389,22 +380,20 @@ define amdgpu_kernel void @v_mul_i64_partially_masked_src0(ptr addrspace(1) %out ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] +; GFX10-NEXT: global_load_dwordx2 v[1:2], v0, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[3:4], v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_and_b32_e32 v6, 0xfff00000, v0 +; GFX10-NEXT: v_and_b32_e32 v5, 0xfff00000, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mad_u64_u32 v[4:5], s2, v6, v2, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, v5 -; GFX10-NEXT: v_mad_u64_u32 v[5:6], s2, v6, v3, v[0:1] -; GFX10-NEXT: v_and_b32_e32 v0, 0xf00f, v1 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v0, v2, v[5:6] -; GFX10-NEXT: v_mov_b32_e32 v5, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: global_store_dwordx2 v0, v[4:5], s[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v5, v3, 0 +; GFX10-NEXT: v_mad_u64_u32 v[4:5], s2, v5, v4, v[1:2] +; GFX10-NEXT: v_and_b32_e32 v1, 0xf00f, v2 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s2, v1, v3, v[4:5] +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_mul_i64_partially_masked_src0: @@ -414,24 +403,22 @@ define amdgpu_kernel void @v_mul_i64_partially_masked_src0(ptr addrspace(1) %out ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] -; GFX11-NEXT: global_load_b64 v[2:3], v2, s[4:5] +; GFX11-NEXT: global_load_b64 v[1:2], v0, s[2:3] +; GFX11-NEXT: global_load_b64 v[3:4], v0, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xfff00000, v0 +; GFX11-NEXT: v_and_b32_e32 v7, 0xfff00000, v1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, v7, v2, 0 -; GFX11-NEXT: v_mov_b32_e32 v0, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mad_u64_u32 v[5:6], null, v7, v3, v[0:1] -; GFX11-NEXT: v_and_b32_e32 v3, 0xf00f, v1 -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, v[5:6] +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v7, v3, 0 +; GFX11-NEXT: v_mad_u64_u32 v[5:6], null, v7, v4, v[1:2] +; GFX11-NEXT: v_and_b32_e32 v4, 0xf00f, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v0, 0 -; GFX11-NEXT: global_store_b64 v0, v[4:5], s[0:1] +; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v4, v3, v[5:6] +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid @@ -536,28 +523,28 @@ define amdgpu_kernel void @v_mul64_masked_before_and_in_branch(ptr addrspace(1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b64 v[2:3], v0, s[2:3] -; GFX11-NEXT: global_load_b64 v[4:5], v0, s[4:5] +; GFX11-NEXT: global_load_b64 v[3:4], v0, s[2:3] +; GFX11-NEXT: global_load_b64 v[5:6], v0, s[4:5] ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_cmpx_ge_u64_e32 0, v[2:3] +; GFX11-NEXT: v_cmpx_ge_u64_e32 0, v[3:4] ; GFX11-NEXT: s_xor_b32 s2, exec_lo, s2 ; GFX11-NEXT: s_cbranch_execz .LBB10_2 ; GFX11-NEXT: ; %bb.1: ; %else ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v4, 0 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v5, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v2, v5, v[1:2] -; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX11-NEXT: v_mov_b32_e32 v1, v3 -; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: v_mov_b32_e32 v4, v1 +; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v3, v6, v[4:5] +; GFX11-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX11-NEXT: ; implicit-def: $vgpr5_vgpr6 ; GFX11-NEXT: .LBB10_2: ; %Flow ; GFX11-NEXT: s_and_not1_saveexec_b32 s2, s2 ; GFX11-NEXT: s_cbranch_execz .LBB10_4 ; GFX11-NEXT: ; %bb.3: ; %if ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mul_lo_u32 v1, v2, v5 +; GFX11-NEXT: v_mul_lo_u32 v1, v3, v6 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: .LBB10_4: ; %endif ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll index 7f10ee4c17450..3eecaccf0308f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll @@ -741,12 +741,13 @@ define i96 @v_mul_i96(i96 %num, i96 %den) { ; GCN-NEXT: v_mov_b32_e32 v6, v0 ; GCN-NEXT: v_mov_b32_e32 v7, v1 ; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v5, 0 -; GCN-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v7, v4, v[0:1] -; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v3, 0 -; GCN-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v2, v3, v[8:9] -; GCN-NEXT: v_mov_b32_e32 v2, v10 -; GCN-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v4, v[1:2] -; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v3, v[8:9] +; GCN-NEXT: v_mov_b32_e32 v8, v2 +; GCN-NEXT: v_mov_b32_e32 v9, v3 +; GCN-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v7, v4, v[0:1] +; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, 0 +; GCN-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v9, v[10:11] +; GCN-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v4, v[1:2] +; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v9, v[10:11] ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_mul_i96: @@ -754,26 +755,26 @@ define i96 @v_mul_i96(i96 %num, i96 %den) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v0 ; GFX10-NEXT: v_mov_b32_e32 v7, v1 +; GFX10-NEXT: v_mov_b32_e32 v8, v3 ; GFX10-NEXT: v_mul_lo_u32 v0, v6, v5 -; GFX10-NEXT: v_mad_u64_u32 v[8:9], s4, v7, v4, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v6, v3, 0 -; GFX10-NEXT: v_mad_u64_u32 v[8:9], s4, v2, v3, v[8:9] -; GFX10-NEXT: v_mov_b32_e32 v2, v8 -; GFX10-NEXT: v_mad_u64_u32 v[8:9], s4, v6, v4, v[1:2] -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v7, v3, v[8:9] +; GFX10-NEXT: v_mad_u64_u32 v[9:10], s4, v7, v4, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v6, v8, 0 +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s4, v2, v8, v[9:10] +; GFX10-NEXT: v_mad_u64_u32 v[9:10], s4, v6, v4, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v7, v8, v[9:10] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_mul_i96: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v6, v0 :: v_dual_mov_b32 v7, v1 +; GFX11-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_mov_b32 v9, v3 ; GFX11-NEXT: v_mul_lo_u32 v0, v6, v5 -; GFX11-NEXT: v_mad_u64_u32 v[8:9], null, v7, v4, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v6, v3, 0 -; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v2, v3, v[8:9] -; GFX11-NEXT: v_mov_b32_e32 v2, v9 -; GFX11-NEXT: v_mad_u64_u32 v[8:9], null, v6, v4, v[1:2] -; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v7, v3, v[8:9] +; GFX11-NEXT: v_mad_u64_u32 v[10:11], null, v7, v4, v[0:1] +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v6, v9, 0 +; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v8, v9, v[10:11] +; GFX11-NEXT: v_mad_u64_u32 v[10:11], null, v6, v4, v[1:2] +; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v7, v9, v[10:11] ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_mul_i96: @@ -784,16 +785,16 @@ define i96 @v_mul_i96(i96 %num, i96 %den) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v6, v0 :: v_dual_mov_b32 v7, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mul_lo_u32 v0, v6, v5 -; GFX12-NEXT: v_mad_co_u64_u32 v[8:9], null, v7, v4, v[0:1] -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v6, v3, 0 +; GFX12-NEXT: v_mov_b32_e32 v8, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mad_co_u64_u32 v[8:9], null, v2, v3, v[8:9] -; GFX12-NEXT: v_mov_b32_e32 v2, v8 +; GFX12-NEXT: v_mul_lo_u32 v0, v6, v5 +; GFX12-NEXT: v_mad_co_u64_u32 v[9:10], null, v7, v4, v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v6, v8, 0 +; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, v2, v8, v[9:10] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mad_co_u64_u32 v[8:9], null, v6, v4, v[1:2] -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v7, v3, v[8:9] +; GFX12-NEXT: v_mad_co_u64_u32 v[9:10], null, v6, v4, v[1:2] +; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v7, v8, v[9:10] ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: v_mul_i96: @@ -1072,18 +1073,17 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX7-NEXT: v_mov_b32_e32 v9, v1 ; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0 ; GFX7-NEXT: v_mov_b32_e32 v10, v2 -; GFX7-NEXT: v_mov_b32_e32 v11, v3 -; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v5, v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v12, v4 +; GFX7-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v9, v5, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v11, v3 ; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v12, 0 -; GFX7-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v10, v12, v[2:3] -; GFX7-NEXT: v_mul_lo_u32 v7, v8, v7 -; GFX7-NEXT: v_mul_lo_u32 v6, v9, v6 -; GFX7-NEXT: v_mov_b32_e32 v2, v13 -; GFX7-NEXT: v_mad_u64_u32 v[3:4], vcc, v8, v5, v[1:2] -; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[3:4] -; GFX7-NEXT: v_addc_u32_e64 v3, s[4:5], v14, v7, s[4:5] -; GFX7-NEXT: v_addc_u32_e32 v3, vcc, v3, v6, vcc +; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v12, v[13:14] +; GFX7-NEXT: v_mul_lo_u32 v4, v9, v6 +; GFX7-NEXT: v_mul_lo_u32 v6, v8, v7 +; GFX7-NEXT: v_mad_u64_u32 v[13:14], vcc, v8, v5, v[1:2] +; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[13:14] +; GFX7-NEXT: v_addc_u32_e64 v3, s[4:5], v3, v6, s[4:5] +; GFX7-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc ; GFX7-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v5, v[3:4] ; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v12, v[6:7] ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -1095,18 +1095,17 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0 ; GFX8-NEXT: v_mov_b32_e32 v10, v2 -; GFX8-NEXT: v_mov_b32_e32 v11, v3 -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v5, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v12, v4 +; GFX8-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v9, v5, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v11, v3 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v12, 0 -; GFX8-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v10, v12, v[2:3] -; GFX8-NEXT: v_mul_lo_u32 v7, v8, v7 -; GFX8-NEXT: v_mul_lo_u32 v6, v9, v6 -; GFX8-NEXT: v_mov_b32_e32 v2, v13 -; GFX8-NEXT: v_mad_u64_u32 v[3:4], vcc, v8, v5, v[1:2] -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[3:4] -; GFX8-NEXT: v_addc_u32_e64 v3, s[4:5], v14, v7, s[4:5] -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v6, vcc +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v12, v[13:14] +; GFX8-NEXT: v_mul_lo_u32 v4, v9, v6 +; GFX8-NEXT: v_mul_lo_u32 v6, v8, v7 +; GFX8-NEXT: v_mad_u64_u32 v[13:14], vcc, v8, v5, v[1:2] +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[13:14] +; GFX8-NEXT: v_addc_u32_e64 v3, s[4:5], v3, v6, s[4:5] +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc ; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v5, v[3:4] ; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v12, v[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -1118,18 +1117,17 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX9-NEXT: v_mov_b32_e32 v9, v1 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0 ; GFX9-NEXT: v_mov_b32_e32 v10, v2 -; GFX9-NEXT: v_mov_b32_e32 v11, v3 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v5, v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v12, v4 +; GFX9-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v9, v5, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v11, v3 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v12, 0 -; GFX9-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v10, v12, v[2:3] -; GFX9-NEXT: v_mul_lo_u32 v7, v8, v7 -; GFX9-NEXT: v_mul_lo_u32 v6, v9, v6 -; GFX9-NEXT: v_mov_b32_e32 v2, v13 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], vcc, v8, v5, v[1:2] -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[3:4] -; GFX9-NEXT: v_addc_co_u32_e64 v3, s[4:5], v14, v7, s[4:5] -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v6, vcc +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v12, v[13:14] +; GFX9-NEXT: v_mul_lo_u32 v4, v9, v6 +; GFX9-NEXT: v_mul_lo_u32 v6, v8, v7 +; GFX9-NEXT: v_mad_u64_u32 v[13:14], vcc, v8, v5, v[1:2] +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[13:14] +; GFX9-NEXT: v_addc_co_u32_e64 v3, s[4:5], v3, v6, s[4:5] +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v4, vcc ; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v5, v[3:4] ; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v12, v[6:7] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1140,19 +1138,19 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX10-NEXT: v_mov_b32_e32 v8, v0 ; GFX10-NEXT: v_mov_b32_e32 v9, v1 ; GFX10-NEXT: v_mov_b32_e32 v10, v2 +; GFX10-NEXT: v_mov_b32_e32 v11, v3 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v8, v6, 0 ; GFX10-NEXT: v_mul_lo_u32 v7, v8, v7 ; GFX10-NEXT: v_mul_lo_u32 v6, v9, v6 -; GFX10-NEXT: v_mad_u64_u32 v[11:12], s4, v9, v5, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[12:13], s4, v9, v5, v[0:1] ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v8, v4, 0 -; GFX10-NEXT: v_mad_u64_u32 v[13:14], s4, v10, v4, v[11:12] -; GFX10-NEXT: v_mov_b32_e32 v2, v13 -; GFX10-NEXT: v_mad_u64_u32 v[11:12], vcc_lo, v8, v5, v[1:2] -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v9, v4, v[11:12] -; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s4, v14, v7, s4 -; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v7, v6, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[5:6], s4, v10, v5, v[6:7] -; GFX10-NEXT: v_mad_u64_u32 v[3:4], s4, v3, v4, v[5:6] +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s4, v10, v4, v[12:13] +; GFX10-NEXT: v_mad_u64_u32 v[12:13], vcc_lo, v8, v5, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v9, v4, v[12:13] +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s4, v3, v7, s4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo +; GFX10-NEXT: v_mad_u64_u32 v[5:6], s4, v10, v5, v[3:4] +; GFX10-NEXT: v_mad_u64_u32 v[3:4], s4, v11, v4, v[5:6] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_mul_i128: @@ -1162,16 +1160,15 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX11-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v4 ; GFX11-NEXT: v_mov_b32_e32 v12, v3 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v8, v6, 0 -; GFX11-NEXT: v_mul_lo_u32 v7, v8, v7 -; GFX11-NEXT: v_mul_lo_u32 v6, v9, v6 -; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v9, v5, v[0:1] +; GFX11-NEXT: v_mul_lo_u32 v4, v9, v6 +; GFX11-NEXT: v_mul_lo_u32 v6, v8, v7 +; GFX11-NEXT: v_mad_u64_u32 v[13:14], null, v9, v5, v[0:1] ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v8, v11, 0 -; GFX11-NEXT: v_mad_u64_u32 v[13:14], null, v10, v11, v[2:3] -; GFX11-NEXT: v_mov_b32_e32 v2, v13 -; GFX11-NEXT: v_mad_u64_u32 v[3:4], vcc_lo, v8, v5, v[1:2] -; GFX11-NEXT: v_mad_u64_u32 v[1:2], s0, v9, v11, v[3:4] -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v14, v7, s0 -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v6, vcc_lo +; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v10, v11, v[13:14] +; GFX11-NEXT: v_mad_u64_u32 v[13:14], vcc_lo, v8, v5, v[1:2] +; GFX11-NEXT: v_mad_u64_u32 v[1:2], s0, v9, v11, v[13:14] +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v6, s0 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v4, vcc_lo ; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v10, v5, v[3:4] ; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v12, v11, v[6:7] ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -1184,28 +1181,26 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v8, v0 :: v_dual_mov_b32 v9, v1 -; GFX12-NEXT: v_mov_b32_e32 v10, v2 +; GFX12-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v8, v6, 0 ; GFX12-NEXT: v_mul_lo_u32 v7, v8, v7 ; GFX12-NEXT: v_mul_lo_u32 v6, v9, v6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], null, v9, v5, v[0:1] +; GFX12-NEXT: v_mad_co_u64_u32 v[12:13], null, v9, v5, v[0:1] ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v8, v4, 0 -; GFX12-NEXT: v_mad_co_u64_u32 v[13:14], null, v10, v4, v[11:12] +; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, v10, v4, v[12:13] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mov_b32_e32 v2, v13 -; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], vcc_lo, v8, v5, v[1:2] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s0, v9, v4, v[11:12] +; GFX12-NEXT: v_mad_co_u64_u32 v[12:13], vcc_lo, v8, v5, v[1:2] +; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s0, v9, v4, v[12:13] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v7, null, v14, v7, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v7, s0 ; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v6, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, v7, v6, vcc_lo -; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], null, v10, v5, v[6:7] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], null, v3, v4, v[5:6] +; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], null, v10, v5, v[3:4] +; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], null, v11, v4, v[5:6] ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: v_mul_i128: @@ -2409,216 +2404,204 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0 -; GFX7-NEXT: v_mul_lo_u32 v28, v3, v12 -; GFX7-NEXT: v_mul_lo_u32 v27, v5, v10 +; GFX7-NEXT: v_mul_lo_u32 v29, v3, v12 +; GFX7-NEXT: v_mul_lo_u32 v30, v2, v13 ; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v13, v[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v12, 0 -; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v12, v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[20:21] -; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v0, v10, 0 -; GFX7-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v4, v10, v[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v1, v9, v[20:21] -; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[8:9], v5, v9, v[22:23] -; GFX7-NEXT: v_mad_u64_u32 v[22:23], vcc, v2, v10, v[18:19] -; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5] -; GFX7-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc -; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[22:23] -; GFX7-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc -; GFX7-NEXT: v_mad_u64_u32 v[22:23], vcc, v4, v8, v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v6, v8, v[20:21] -; GFX7-NEXT: v_addc_u32_e32 v26, vcc, 0, v24, vcc -; GFX7-NEXT: v_mov_b32_e32 v21, v22 -; GFX7-NEXT: v_mov_b32_e32 v22, v23 -; GFX7-NEXT: v_mov_b32_e32 v23, v18 -; GFX7-NEXT: v_mad_u64_u32 v[24:25], vcc, v0, v13, v[22:23] -; GFX7-NEXT: v_mul_lo_u32 v18, v6, v9 -; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7] -; GFX7-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v2, v8, v[16:17] -; GFX7-NEXT: v_addc_u32_e64 v6, s[4:5], 0, v6, s[4:5] -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v12, v[24:25] -; GFX7-NEXT: v_mov_b32_e32 v20, v23 -; GFX7-NEXT: v_mul_lo_u32 v25, v4, v11 -; GFX7-NEXT: v_mad_u64_u32 v[23:24], s[6:7], v2, v11, v[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v0, v11, v[20:21] -; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[23:24] -; GFX7-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[10:11] -; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[10:11], v1, v10, v[16:17] -; GFX7-NEXT: v_addc_u32_e64 v24, s[10:11], 0, v23, s[10:11] -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v2, v9, v[20:21] -; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[12:13], v4, v9, v[11:12] -; GFX7-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0 -; GFX7-NEXT: v_mov_b32_e32 v12, v22 -; GFX7-NEXT: v_mul_lo_u32 v2, v2, v13 -; GFX7-NEXT: v_mad_u64_u32 v[22:23], s[16:17], v0, v9, v[11:12] -; GFX7-NEXT: v_addc_u32_e64 v13, s[10:11], 0, v24, s[10:11] -; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[14:15], v5, v8, v[20:21] -; GFX7-NEXT: v_mad_u64_u32 v[4:5], s[10:11], v3, v8, v[16:17] -; GFX7-NEXT: v_addc_u32_e64 v16, s[10:11], 0, v13, s[10:11] -; GFX7-NEXT: v_mul_lo_u32 v9, v1, v14 -; GFX7-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[13:14], s[10:11], v1, v8, v[22:23] -; GFX7-NEXT: v_addc_u32_e64 v3, s[10:11], v3, v4, s[10:11] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[18:19] +; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v10, 0 +; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v3, v11, v[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v9, v[18:19] +; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v4, v10, v[20:21] +; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v0, v12, 0 +; GFX7-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v5, v9, v[18:19] +; GFX7-NEXT: v_mad_u64_u32 v[24:25], s[6:7], v1, v11, v[20:21] +; GFX7-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] +; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v8, v[16:17] +; GFX7-NEXT: v_addc_u32_e32 v28, vcc, 0, v20, vcc +; GFX7-NEXT: v_mad_u64_u32 v[16:17], vcc, v2, v10, v[24:25] +; GFX7-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] +; GFX7-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc +; GFX7-NEXT: v_mad_u64_u32 v[26:27], s[8:9], v6, v8, v[22:23] +; GFX7-NEXT: v_mad_u64_u32 v[22:23], vcc, v3, v9, v[16:17] +; GFX7-NEXT: v_addc_u32_e32 v16, vcc, 0, v20, vcc +; GFX7-NEXT: v_mad_u64_u32 v[20:21], vcc, v4, v8, v[22:23] +; GFX7-NEXT: v_mov_b32_e32 v22, v26 +; GFX7-NEXT: v_addc_u32_e32 v23, vcc, 0, v16, vcc +; GFX7-NEXT: v_mad_u64_u32 v[16:17], vcc, v0, v13, v[21:22] +; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[12:13], v0, v11, v[19:20] +; GFX7-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v1, v12, v[16:17] +; GFX7-NEXT: v_mul_lo_u32 v26, v4, v11 +; GFX7-NEXT: v_mul_lo_u32 v25, v5, v10 +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v2, v11, v[19:20] +; GFX7-NEXT: v_mul_lo_u32 v24, v6, v9 +; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v4, v9, v[11:12] +; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13] +; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[12:13], v1, v10, v[21:22] +; GFX7-NEXT: v_addc_u32_e64 v4, s[12:13], 0, v4, s[12:13] +; GFX7-NEXT: v_mad_u64_u32 v[19:20], s[12:13], v2, v9, v[11:12] +; GFX7-NEXT: v_mul_lo_u32 v10, v1, v14 +; GFX7-NEXT: v_mad_u64_u32 v[13:14], s[14:15], v5, v8, v[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[16:17], v0, v8, 0 +; GFX7-NEXT: v_addc_u32_e64 v2, s[12:13], 0, v4, s[12:13] +; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[12:13], v3, v8, v[19:20] +; GFX7-NEXT: v_addc_u32_e64 v5, s[12:13], 0, v2, s[12:13] +; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[12:13], v0, v9, v[17:18] +; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13] ; GFX7-NEXT: v_mul_lo_u32 v0, v0, v15 -; GFX7-NEXT: v_addc_u32_e64 v4, s[10:11], v6, v5, s[10:11] -; GFX7-NEXT: v_addc_u32_e64 v5, s[10:11], v16, v11, s[10:11] -; GFX7-NEXT: v_addc_u32_e64 v6, s[10:11], v26, v12, s[10:11] -; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v19, v0, s[10:11] -; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v9, s[14:15] -; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v2, s[12:13] -; GFX7-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v28, s[8:9] -; GFX7-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v25, s[6:7] -; GFX7-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v27, s[4:5] -; GFX7-NEXT: v_addc_u32_e32 v0, vcc, v0, v18, vcc -; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v7, v8, v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v0, v10 -; GFX7-NEXT: v_mov_b32_e32 v1, v13 -; GFX7-NEXT: v_mov_b32_e32 v2, v14 -; GFX7-NEXT: v_mov_b32_e32 v7, v11 +; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[12:13], v1, v8, v[2:3] +; GFX7-NEXT: v_addc_u32_e64 v3, s[12:13], v4, v21, s[12:13] +; GFX7-NEXT: v_addc_u32_e64 v4, s[12:13], v28, v22, s[12:13] +; GFX7-NEXT: v_addc_u32_e64 v5, s[12:13], v5, v13, s[12:13] +; GFX7-NEXT: v_addc_u32_e64 v6, s[12:13], v23, v14, s[12:13] +; GFX7-NEXT: v_addc_u32_e64 v0, s[12:13], v27, v0, s[12:13] +; GFX7-NEXT: v_addc_u32_e64 v0, s[12:13], v0, v10, s[14:15] +; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v30, s[10:11] +; GFX7-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v29, s[8:9] +; GFX7-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v26, s[6:7] +; GFX7-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v25, s[4:5] +; GFX7-NEXT: v_addc_u32_e32 v0, vcc, v0, v24, vcc +; GFX7-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v7, v8, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, v16 +; GFX7-NEXT: v_mov_b32_e32 v1, v11 +; GFX7-NEXT: v_mov_b32_e32 v2, v12 +; GFX7-NEXT: v_mov_b32_e32 v7, v9 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_mul_i256: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0 -; GFX8-NEXT: v_mul_lo_u32 v28, v3, v12 -; GFX8-NEXT: v_mul_lo_u32 v27, v5, v10 +; GFX8-NEXT: v_mul_lo_u32 v29, v3, v12 +; GFX8-NEXT: v_mul_lo_u32 v30, v2, v13 ; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v13, v[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v12, 0 -; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v12, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[20:21] -; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v0, v10, 0 -; GFX8-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v4, v10, v[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v1, v9, v[20:21] -; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[8:9], v5, v9, v[22:23] -; GFX8-NEXT: v_mad_u64_u32 v[22:23], vcc, v2, v10, v[18:19] -; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5] -; GFX8-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc -; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[22:23] -; GFX8-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc -; GFX8-NEXT: v_mad_u64_u32 v[22:23], vcc, v4, v8, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v6, v8, v[20:21] -; GFX8-NEXT: v_addc_u32_e32 v26, vcc, 0, v24, vcc -; GFX8-NEXT: v_mov_b32_e32 v21, v22 -; GFX8-NEXT: v_mov_b32_e32 v22, v23 -; GFX8-NEXT: v_mov_b32_e32 v23, v18 -; GFX8-NEXT: v_mad_u64_u32 v[24:25], vcc, v0, v13, v[22:23] -; GFX8-NEXT: v_mul_lo_u32 v18, v6, v9 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7] -; GFX8-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v2, v8, v[16:17] -; GFX8-NEXT: v_addc_u32_e64 v6, s[4:5], 0, v6, s[4:5] -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v12, v[24:25] -; GFX8-NEXT: v_mov_b32_e32 v20, v23 -; GFX8-NEXT: v_mul_lo_u32 v25, v4, v11 -; GFX8-NEXT: v_mad_u64_u32 v[23:24], s[6:7], v2, v11, v[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v0, v11, v[20:21] -; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[23:24] -; GFX8-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[10:11] -; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[10:11], v1, v10, v[16:17] -; GFX8-NEXT: v_addc_u32_e64 v24, s[10:11], 0, v23, s[10:11] -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v2, v9, v[20:21] -; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[12:13], v4, v9, v[11:12] -; GFX8-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0 -; GFX8-NEXT: v_mov_b32_e32 v12, v22 -; GFX8-NEXT: v_mul_lo_u32 v2, v2, v13 -; GFX8-NEXT: v_mad_u64_u32 v[22:23], s[16:17], v0, v9, v[11:12] -; GFX8-NEXT: v_addc_u32_e64 v13, s[10:11], 0, v24, s[10:11] -; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[14:15], v5, v8, v[20:21] -; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[10:11], v3, v8, v[16:17] -; GFX8-NEXT: v_addc_u32_e64 v16, s[10:11], 0, v13, s[10:11] -; GFX8-NEXT: v_mul_lo_u32 v9, v1, v14 -; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[13:14], s[10:11], v1, v8, v[22:23] -; GFX8-NEXT: v_addc_u32_e64 v3, s[10:11], v3, v4, s[10:11] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[18:19] +; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v10, 0 +; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v3, v11, v[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v9, v[18:19] +; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v4, v10, v[20:21] +; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v0, v12, 0 +; GFX8-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v5, v9, v[18:19] +; GFX8-NEXT: v_mad_u64_u32 v[24:25], s[6:7], v1, v11, v[20:21] +; GFX8-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] +; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v8, v[16:17] +; GFX8-NEXT: v_addc_u32_e32 v28, vcc, 0, v20, vcc +; GFX8-NEXT: v_mad_u64_u32 v[16:17], vcc, v2, v10, v[24:25] +; GFX8-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] +; GFX8-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc +; GFX8-NEXT: v_mad_u64_u32 v[26:27], s[8:9], v6, v8, v[22:23] +; GFX8-NEXT: v_mad_u64_u32 v[22:23], vcc, v3, v9, v[16:17] +; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v20, vcc +; GFX8-NEXT: v_mad_u64_u32 v[20:21], vcc, v4, v8, v[22:23] +; GFX8-NEXT: v_mov_b32_e32 v22, v26 +; GFX8-NEXT: v_addc_u32_e32 v23, vcc, 0, v16, vcc +; GFX8-NEXT: v_mad_u64_u32 v[16:17], vcc, v0, v13, v[21:22] +; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[12:13], v0, v11, v[19:20] +; GFX8-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v1, v12, v[16:17] +; GFX8-NEXT: v_mul_lo_u32 v26, v4, v11 +; GFX8-NEXT: v_mul_lo_u32 v25, v5, v10 +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v2, v11, v[19:20] +; GFX8-NEXT: v_mul_lo_u32 v24, v6, v9 +; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v4, v9, v[11:12] +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13] +; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[12:13], v1, v10, v[21:22] +; GFX8-NEXT: v_addc_u32_e64 v4, s[12:13], 0, v4, s[12:13] +; GFX8-NEXT: v_mad_u64_u32 v[19:20], s[12:13], v2, v9, v[11:12] +; GFX8-NEXT: v_mul_lo_u32 v10, v1, v14 +; GFX8-NEXT: v_mad_u64_u32 v[13:14], s[14:15], v5, v8, v[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[16:17], v0, v8, 0 +; GFX8-NEXT: v_addc_u32_e64 v2, s[12:13], 0, v4, s[12:13] +; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[12:13], v3, v8, v[19:20] +; GFX8-NEXT: v_addc_u32_e64 v5, s[12:13], 0, v2, s[12:13] +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[12:13], v0, v9, v[17:18] +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13] ; GFX8-NEXT: v_mul_lo_u32 v0, v0, v15 -; GFX8-NEXT: v_addc_u32_e64 v4, s[10:11], v6, v5, s[10:11] -; GFX8-NEXT: v_addc_u32_e64 v5, s[10:11], v16, v11, s[10:11] -; GFX8-NEXT: v_addc_u32_e64 v6, s[10:11], v26, v12, s[10:11] -; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v19, v0, s[10:11] -; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v9, s[14:15] -; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v2, s[12:13] -; GFX8-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v28, s[8:9] -; GFX8-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v25, s[6:7] -; GFX8-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v27, s[4:5] -; GFX8-NEXT: v_addc_u32_e32 v0, vcc, v0, v18, vcc -; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v7, v8, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, v10 -; GFX8-NEXT: v_mov_b32_e32 v1, v13 -; GFX8-NEXT: v_mov_b32_e32 v2, v14 -; GFX8-NEXT: v_mov_b32_e32 v7, v11 +; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[12:13], v1, v8, v[2:3] +; GFX8-NEXT: v_addc_u32_e64 v3, s[12:13], v4, v21, s[12:13] +; GFX8-NEXT: v_addc_u32_e64 v4, s[12:13], v28, v22, s[12:13] +; GFX8-NEXT: v_addc_u32_e64 v5, s[12:13], v5, v13, s[12:13] +; GFX8-NEXT: v_addc_u32_e64 v6, s[12:13], v23, v14, s[12:13] +; GFX8-NEXT: v_addc_u32_e64 v0, s[12:13], v27, v0, s[12:13] +; GFX8-NEXT: v_addc_u32_e64 v0, s[12:13], v0, v10, s[14:15] +; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v30, s[10:11] +; GFX8-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v29, s[8:9] +; GFX8-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v26, s[6:7] +; GFX8-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v25, s[4:5] +; GFX8-NEXT: v_addc_u32_e32 v0, vcc, v0, v24, vcc +; GFX8-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v7, v8, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, v16 +; GFX8-NEXT: v_mov_b32_e32 v1, v11 +; GFX8-NEXT: v_mov_b32_e32 v2, v12 +; GFX8-NEXT: v_mov_b32_e32 v7, v9 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_mul_i256: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0 -; GFX9-NEXT: v_mul_lo_u32 v28, v3, v12 -; GFX9-NEXT: v_mul_lo_u32 v27, v5, v10 +; GFX9-NEXT: v_mul_lo_u32 v29, v3, v12 +; GFX9-NEXT: v_mul_lo_u32 v30, v2, v13 ; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v13, v[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v12, 0 -; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v12, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[20:21] -; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v0, v10, 0 -; GFX9-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v4, v10, v[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v1, v9, v[20:21] -; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[8:9], v5, v9, v[22:23] -; GFX9-NEXT: v_mad_u64_u32 v[22:23], vcc, v2, v10, v[18:19] -; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5] -; GFX9-NEXT: v_addc_co_u32_e32 v24, vcc, 0, v24, vcc -; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[22:23] -; GFX9-NEXT: v_addc_co_u32_e32 v24, vcc, 0, v24, vcc -; GFX9-NEXT: v_mad_u64_u32 v[22:23], vcc, v4, v8, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v6, v8, v[20:21] -; GFX9-NEXT: v_addc_co_u32_e32 v26, vcc, 0, v24, vcc -; GFX9-NEXT: v_mov_b32_e32 v21, v22 -; GFX9-NEXT: v_mov_b32_e32 v22, v23 -; GFX9-NEXT: v_mov_b32_e32 v23, v18 -; GFX9-NEXT: v_mad_u64_u32 v[24:25], vcc, v0, v13, v[22:23] -; GFX9-NEXT: v_mul_lo_u32 v18, v6, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7] -; GFX9-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v2, v8, v[16:17] -; GFX9-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, v6, s[4:5] -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v12, v[24:25] -; GFX9-NEXT: v_mov_b32_e32 v20, v23 -; GFX9-NEXT: v_mul_lo_u32 v25, v4, v11 -; GFX9-NEXT: v_mad_u64_u32 v[23:24], s[6:7], v2, v11, v[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v0, v11, v[20:21] -; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[23:24] -; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[10:11] -; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[10:11], v1, v10, v[16:17] -; GFX9-NEXT: v_addc_co_u32_e64 v24, s[10:11], 0, v23, s[10:11] -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v2, v9, v[20:21] -; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[12:13], v4, v9, v[11:12] -; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0 -; GFX9-NEXT: v_mov_b32_e32 v12, v22 -; GFX9-NEXT: v_mul_lo_u32 v2, v2, v13 -; GFX9-NEXT: v_mad_u64_u32 v[22:23], s[16:17], v0, v9, v[11:12] -; GFX9-NEXT: v_addc_co_u32_e64 v13, s[10:11], 0, v24, s[10:11] -; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[14:15], v5, v8, v[20:21] -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[10:11], v3, v8, v[16:17] -; GFX9-NEXT: v_addc_co_u32_e64 v16, s[10:11], 0, v13, s[10:11] -; GFX9-NEXT: v_mul_lo_u32 v9, v1, v14 -; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[13:14], s[10:11], v1, v8, v[22:23] -; GFX9-NEXT: v_addc_co_u32_e64 v3, s[10:11], v3, v4, s[10:11] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[18:19] +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v10, 0 +; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v3, v11, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v9, v[18:19] +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v4, v10, v[20:21] +; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v0, v12, 0 +; GFX9-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v5, v9, v[18:19] +; GFX9-NEXT: v_mad_u64_u32 v[24:25], s[6:7], v1, v11, v[20:21] +; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] +; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v8, v[16:17] +; GFX9-NEXT: v_addc_co_u32_e32 v28, vcc, 0, v20, vcc +; GFX9-NEXT: v_mad_u64_u32 v[16:17], vcc, v2, v10, v[24:25] +; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] +; GFX9-NEXT: v_addc_co_u32_e32 v20, vcc, 0, v20, vcc +; GFX9-NEXT: v_mad_u64_u32 v[26:27], s[8:9], v6, v8, v[22:23] +; GFX9-NEXT: v_mad_u64_u32 v[22:23], vcc, v3, v9, v[16:17] +; GFX9-NEXT: v_addc_co_u32_e32 v16, vcc, 0, v20, vcc +; GFX9-NEXT: v_mad_u64_u32 v[20:21], vcc, v4, v8, v[22:23] +; GFX9-NEXT: v_mov_b32_e32 v22, v26 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v16, vcc +; GFX9-NEXT: v_mad_u64_u32 v[16:17], vcc, v0, v13, v[21:22] +; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[12:13], v0, v11, v[19:20] +; GFX9-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v1, v12, v[16:17] +; GFX9-NEXT: v_mul_lo_u32 v26, v4, v11 +; GFX9-NEXT: v_mul_lo_u32 v25, v5, v10 +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v2, v11, v[19:20] +; GFX9-NEXT: v_mul_lo_u32 v24, v6, v9 +; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v4, v9, v[11:12] +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13] +; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[12:13], v1, v10, v[21:22] +; GFX9-NEXT: v_addc_co_u32_e64 v4, s[12:13], 0, v4, s[12:13] +; GFX9-NEXT: v_mad_u64_u32 v[19:20], s[12:13], v2, v9, v[11:12] +; GFX9-NEXT: v_mul_lo_u32 v10, v1, v14 +; GFX9-NEXT: v_mad_u64_u32 v[13:14], s[14:15], v5, v8, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[16:17], v0, v8, 0 +; GFX9-NEXT: v_addc_co_u32_e64 v2, s[12:13], 0, v4, s[12:13] +; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[12:13], v3, v8, v[19:20] +; GFX9-NEXT: v_addc_co_u32_e64 v5, s[12:13], 0, v2, s[12:13] +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[12:13], v0, v9, v[17:18] +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13] ; GFX9-NEXT: v_mul_lo_u32 v0, v0, v15 -; GFX9-NEXT: v_addc_co_u32_e64 v4, s[10:11], v6, v5, s[10:11] -; GFX9-NEXT: v_addc_co_u32_e64 v5, s[10:11], v16, v11, s[10:11] -; GFX9-NEXT: v_addc_co_u32_e64 v6, s[10:11], v26, v12, s[10:11] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v19, v0, s[10:11] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v0, v9, s[14:15] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v0, v2, s[12:13] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[8:9], v0, v28, s[8:9] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[6:7], v0, v25, s[6:7] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[4:5], v0, v27, s[4:5] -; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v18, vcc -; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v7, v8, v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, v10 -; GFX9-NEXT: v_mov_b32_e32 v1, v13 -; GFX9-NEXT: v_mov_b32_e32 v2, v14 -; GFX9-NEXT: v_mov_b32_e32 v7, v11 +; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[12:13], v1, v8, v[2:3] +; GFX9-NEXT: v_addc_co_u32_e64 v3, s[12:13], v4, v21, s[12:13] +; GFX9-NEXT: v_addc_co_u32_e64 v4, s[12:13], v28, v22, s[12:13] +; GFX9-NEXT: v_addc_co_u32_e64 v5, s[12:13], v5, v13, s[12:13] +; GFX9-NEXT: v_addc_co_u32_e64 v6, s[12:13], v23, v14, s[12:13] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[12:13], v27, v0, s[12:13] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[12:13], v0, v10, s[14:15] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v0, v30, s[10:11] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[8:9], v0, v29, s[8:9] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[6:7], v0, v26, s[6:7] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[4:5], v0, v25, s[4:5] +; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v24, vcc +; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v7, v8, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, v16 +; GFX9-NEXT: v_mov_b32_e32 v1, v11 +; GFX9-NEXT: v_mov_b32_e32 v2, v12 +; GFX9-NEXT: v_mov_b32_e32 v7, v9 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_mul_i256: @@ -2626,69 +2609,67 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v16, v0 ; GFX10-NEXT: v_mov_b32_e32 v17, v1 -; GFX10-NEXT: v_mul_lo_u32 v29, v4, v11 -; GFX10-NEXT: v_mul_lo_u32 v31, v3, v12 -; GFX10-NEXT: v_mul_lo_u32 v30, v2, v13 +; GFX10-NEXT: v_mov_b32_e32 v18, v2 +; GFX10-NEXT: v_mov_b32_e32 v19, v3 +; GFX10-NEXT: v_mul_lo_u32 v27, v6, v9 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v16, v14, 0 -; GFX10-NEXT: v_mul_lo_u32 v28, v17, v14 -; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v17, v13, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v16, v12, 0 -; GFX10-NEXT: v_mad_u64_u32 v[20:21], s4, v2, v12, v[18:19] -; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v17, v11, v[0:1] +; GFX10-NEXT: v_mul_lo_u32 v30, v4, v11 +; GFX10-NEXT: v_mul_lo_u32 v28, v5, v10 +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s4, v17, v13, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v18, v12, v[2:3] +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s4, v16, v12, 0 +; GFX10-NEXT: v_mad_u64_u32 v[20:21], s4, v19, v11, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v17, v11, v[2:3] ; GFX10-NEXT: v_cndmask_b32_e64 v22, 0, 1, s4 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, v3, v11, v[20:21] -; GFX10-NEXT: v_mad_u64_u32 v[20:21], vcc_lo, v2, v10, v[18:19] +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s5, v4, v10, v[20:21] +; GFX10-NEXT: v_mad_u64_u32 v[20:21], vcc_lo, v18, v10, v[0:1] ; GFX10-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, 0, v22, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v4, v10, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[0:1], vcc_lo, v3, v9, v[20:21] -; GFX10-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v22, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[20:21], s4, v5, v9, v[18:19] -; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v16, v10, 0 -; GFX10-NEXT: v_mad_u64_u32 v[22:23], vcc_lo, v4, v8, v[0:1] -; GFX10-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v26, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[24:25], s4, v6, v8, v[20:21] -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v17, v9, v[18:19] -; GFX10-NEXT: v_mov_b32_e32 v18, v23 -; GFX10-NEXT: v_cndmask_b32_e64 v27, 0, 1, s4 -; GFX10-NEXT: v_mul_lo_u32 v23, v6, v9 -; GFX10-NEXT: v_mov_b32_e32 v19, v24 -; GFX10-NEXT: v_mul_lo_u32 v24, v5, v10 -; GFX10-NEXT: v_mad_u64_u32 v[20:21], vcc_lo, v2, v8, v[0:1] -; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v27, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v16, v13, v[18:19] -; GFX10-NEXT: v_mov_b32_e32 v19, v22 -; GFX10-NEXT: v_mul_lo_u32 v27, v16, v15 -; GFX10-NEXT: v_mov_b32_e32 v18, v21 -; GFX10-NEXT: v_mad_u64_u32 v[21:22], vcc_lo, v17, v12, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[14:15], s6, v16, v11, v[18:19] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v5, v9, v[2:3] +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s4, v16, v10, 0 +; GFX10-NEXT: v_mad_u64_u32 v[23:24], vcc_lo, v19, v9, v[20:21] +; GFX10-NEXT: v_mad_u64_u32 v[25:26], s4, v6, v8, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v17, v9, v[2:3] +; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v22, vcc_lo +; GFX10-NEXT: v_mad_u64_u32 v[21:22], vcc_lo, v4, v8, v[23:24] +; GFX10-NEXT: v_mov_b32_e32 v23, v25 +; GFX10-NEXT: v_cndmask_b32_e64 v20, 0, 1, s4 +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s4, v18, v8, v[0:1] +; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v6, vcc_lo +; GFX10-NEXT: v_mad_u64_u32 v[0:1], vcc_lo, v16, v13, v[22:23] +; GFX10-NEXT: v_add_co_ci_u32_e64 v29, s4, 0, v20, s4 +; GFX10-NEXT: v_mov_b32_e32 v20, v3 +; GFX10-NEXT: v_mad_u64_u32 v[22:23], s4, v17, v12, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[24:25], s6, v16, v11, v[20:21] ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, v16, v8, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v32, 0, 1, s6 -; GFX10-NEXT: v_mad_u64_u32 v[18:19], s5, v2, v11, v[21:22] -; GFX10-NEXT: v_mad_u64_u32 v[11:12], s6, v17, v10, v[14:15] -; GFX10-NEXT: v_mov_b32_e32 v13, v1 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s6, 0, v32, s6 -; GFX10-NEXT: v_mov_b32_e32 v14, v20 -; GFX10-NEXT: v_mad_u64_u32 v[21:22], s7, v3, v10, v[18:19] -; GFX10-NEXT: v_mad_u64_u32 v[18:19], s6, v2, v9, v[11:12] -; GFX10-NEXT: v_add_co_ci_u32_e64 v15, s6, 0, v1, s6 -; GFX10-NEXT: v_mad_u64_u32 v[10:11], s8, v16, v9, v[13:14] -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s6, v4, v9, v[21:22] -; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s8 -; GFX10-NEXT: v_mad_u64_u32 v[12:13], s8, v3, v8, v[18:19] -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s8, 0, v15, s8 -; GFX10-NEXT: v_mad_u64_u32 v[14:15], s8, v5, v8, v[1:2] -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s9, v17, v8, v[10:11] -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s9, v4, v12, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s9, v6, v13, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s9, v9, v14, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s9, v26, v15, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s9, v25, v27, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s8, v9, v28, s8 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v30, s6 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v31, s7 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s5, v9, v29, s5 -; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v24, vcc_lo -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, vcc_lo, v9, v23, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6 +; GFX10-NEXT: v_mad_u64_u32 v[20:21], s5, v18, v11, v[22:23] +; GFX10-NEXT: v_mul_lo_u32 v22, v16, v15 +; GFX10-NEXT: v_mul_lo_u32 v23, v17, v14 +; GFX10-NEXT: v_mad_u64_u32 v[14:15], s6, v17, v10, v[24:25] +; GFX10-NEXT: v_mul_lo_u32 v24, v19, v12 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s6, 0, v3, s6 +; GFX10-NEXT: v_mad_u64_u32 v[11:12], s7, v19, v10, v[20:21] +; GFX10-NEXT: v_mul_lo_u32 v25, v18, v13 +; GFX10-NEXT: v_mad_u64_u32 v[20:21], s6, v18, v9, v[14:15] +; GFX10-NEXT: v_add_co_ci_u32_e64 v15, s6, 0, v3, s6 +; GFX10-NEXT: v_mad_u64_u32 v[13:14], s6, v4, v9, v[11:12] +; GFX10-NEXT: v_mad_u64_u32 v[3:4], s8, v16, v9, v[1:2] +; GFX10-NEXT: v_cndmask_b32_e64 v16, 0, 1, s8 +; GFX10-NEXT: v_mad_u64_u32 v[9:10], s8, v19, v8, v[20:21] +; GFX10-NEXT: v_add_co_ci_u32_e64 v15, s8, 0, v15, s8 +; GFX10-NEXT: v_mad_u64_u32 v[11:12], s8, v5, v8, v[13:14] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s9, v17, v8, v[3:4] +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s9, v16, v9, s9 +; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s9, v29, v10, s9 +; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s9, v15, v11, s9 +; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s9, v6, v12, s9 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s9, v26, v22, s9 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s8, v9, v23, s8 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v25, s6 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v24, s7 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s5, v9, v30, s5 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s4, v9, v28, s4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v27, vcc_lo ; GFX10-NEXT: v_mad_u64_u32 v[7:8], s4, v7, v8, v[9:10] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -2696,69 +2677,68 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1 -; GFX11-NEXT: v_dual_mov_b32 v18, v8 :: v_dual_mov_b32 v19, v7 -; GFX11-NEXT: v_mul_lo_u32 v30, v4, v11 +; GFX11-NEXT: v_dual_mov_b32 v18, v2 :: v_dual_mov_b32 v19, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v8 :: v_dual_mov_b32 v21, v7 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v14, 0 -; GFX11-NEXT: v_mul_lo_u32 v28, v16, v15 -; GFX11-NEXT: v_mul_lo_u32 v29, v17, v14 -; GFX11-NEXT: v_mul_lo_u32 v32, v3, v12 -; GFX11-NEXT: v_mul_lo_u32 v31, v2, v13 -; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v17, v13, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v12, 0 -; GFX11-NEXT: v_mad_u64_u32 v[20:21], null, v2, v12, v[7:8] -; GFX11-NEXT: v_mad_u64_u32 v[7:8], s0, v17, v11, v[0:1] +; GFX11-NEXT: v_mul_lo_u32 v27, v6, v9 +; GFX11-NEXT: v_mul_lo_u32 v30, v4, v11 +; GFX11-NEXT: v_mul_lo_u32 v31, v17, v14 +; GFX11-NEXT: v_mul_lo_u32 v28, v5, v10 +; GFX11-NEXT: v_mul_lo_u32 v15, v16, v15 +; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v17, v13, v[0:1] +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v18, v12, v[2:3] +; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v16, v12, 0 +; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v19, v11, v[0:1] +; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v17, v11, v[2:3] ; GFX11-NEXT: v_cndmask_b32_e64 v22, 0, 1, s0 -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v11, v[20:21] -; GFX11-NEXT: v_mad_u64_u32 v[20:21], vcc_lo, v2, v10, v[7:8] +; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v4, v10, v[7:8] +; GFX11-NEXT: v_mad_u64_u32 v[7:8], vcc_lo, v18, v10, v[0:1] ; GFX11-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v22, vcc_lo -; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v4, v10, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[0:1], vcc_lo, v3, v9, v[20:21] -; GFX11-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v22, vcc_lo -; GFX11-NEXT: v_mad_u64_u32 v[20:21], null, v5, v9, v[7:8] -; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v16, v10, 0 -; GFX11-NEXT: v_mad_u64_u32 v[22:23], vcc_lo, v4, v18, v[0:1] -; GFX11-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v26, vcc_lo -; GFX11-NEXT: v_mad_u64_u32 v[24:25], null, v6, v18, v[20:21] -; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v17, v9, v[7:8] -; GFX11-NEXT: v_mov_b32_e32 v7, v23 -; GFX11-NEXT: v_cndmask_b32_e64 v27, 0, 1, s0 -; GFX11-NEXT: v_mul_lo_u32 v23, v6, v9 -; GFX11-NEXT: v_mov_b32_e32 v8, v24 -; GFX11-NEXT: v_mul_lo_u32 v24, v5, v10 -; GFX11-NEXT: v_mad_u64_u32 v[20:21], vcc_lo, v2, v18, v[0:1] -; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo -; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v16, v13, v[7:8] -; GFX11-NEXT: v_dual_mov_b32 v7, v22 :: v_dual_mov_b32 v6, v21 -; GFX11-NEXT: v_mad_u64_u32 v[21:22], vcc_lo, v17, v12, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[14:15], s2, v16, v11, v[6:7] -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v18, 0 -; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, 1, s2 -; GFX11-NEXT: v_mad_u64_u32 v[6:7], s1, v2, v11, v[21:22] -; GFX11-NEXT: v_mad_u64_u32 v[11:12], s2, v17, v10, v[14:15] -; GFX11-NEXT: v_dual_mov_b32 v13, v1 :: v_dual_mov_b32 v14, v20 -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v8, s2 -; GFX11-NEXT: v_mad_u64_u32 v[21:22], s3, v3, v10, v[6:7] -; GFX11-NEXT: v_mad_u64_u32 v[6:7], s2, v2, v9, v[11:12] -; GFX11-NEXT: v_add_co_ci_u32_e64 v12, null, 0, v1, s2 -; GFX11-NEXT: v_mad_u64_u32 v[10:11], s4, v16, v9, v[13:14] -; GFX11-NEXT: v_mad_u64_u32 v[1:2], s2, v4, v9, v[21:22] -; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4 -; GFX11-NEXT: v_mad_u64_u32 v[8:9], s4, v3, v18, v[6:7] -; GFX11-NEXT: v_add_co_ci_u32_e64 v12, null, 0, v12, s4 -; GFX11-NEXT: v_mad_u64_u32 v[6:7], s4, v5, v18, v[1:2] -; GFX11-NEXT: v_mad_u64_u32 v[1:2], s5, v17, v18, v[10:11] -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s5, v4, v8, s5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s5, v27, v9, s5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v5, s5, v12, v6, s5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s5, v26, v7, s5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v25, v28, s5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v29, s4 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v31, s2 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v32, s3 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v9, v[2:3] +; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v16, v10, 0 +; GFX11-NEXT: v_mad_u64_u32 v[25:26], vcc_lo, v19, v9, v[7:8] +; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v6, v20, v[0:1] +; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v17, v9, v[2:3] +; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v22, vcc_lo +; GFX11-NEXT: v_mad_u64_u32 v[23:24], vcc_lo, v4, v20, v[25:26] +; GFX11-NEXT: v_mov_b32_e32 v25, v7 +; GFX11-NEXT: v_cndmask_b32_e64 v22, 0, 1, s0 +; GFX11-NEXT: v_mad_u64_u32 v[2:3], s0, v18, v20, v[0:1] +; GFX11-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v6, vcc_lo +; GFX11-NEXT: v_mad_u64_u32 v[0:1], vcc_lo, v16, v13, v[24:25] +; GFX11-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v22, s0 +; GFX11-NEXT: v_mov_b32_e32 v22, v3 +; GFX11-NEXT: v_mad_u64_u32 v[6:7], s0, v17, v12, v[0:1] +; GFX11-NEXT: v_mad_u64_u32 v[24:25], s2, v16, v11, v[22:23] +; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v20, 0 +; GFX11-NEXT: v_mad_u64_u32 v[22:23], s1, v18, v11, v[6:7] +; GFX11-NEXT: v_mad_u64_u32 v[6:7], s2, v17, v10, v[24:25] +; GFX11-NEXT: v_mul_lo_u32 v24, v19, v12 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, s2 +; GFX11-NEXT: v_mad_u64_u32 v[11:12], s3, v19, v10, v[22:23] +; GFX11-NEXT: v_mul_lo_u32 v22, v18, v13 +; GFX11-NEXT: v_mad_u64_u32 v[13:14], s2, v18, v9, v[6:7] +; GFX11-NEXT: v_add_co_ci_u32_e64 v18, null, 0, v3, s2 +; GFX11-NEXT: v_mad_u64_u32 v[6:7], s2, v4, v9, v[11:12] +; GFX11-NEXT: v_mad_u64_u32 v[3:4], s4, v16, v9, v[1:2] +; GFX11-NEXT: v_cndmask_b32_e64 v16, 0, 1, s4 +; GFX11-NEXT: v_mad_u64_u32 v[9:10], s4, v19, v20, v[13:14] +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v18, s4 +; GFX11-NEXT: v_mad_u64_u32 v[11:12], s4, v5, v20, v[6:7] +; GFX11-NEXT: v_mad_u64_u32 v[1:2], s5, v17, v20, v[3:4] +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s5, v16, v9, s5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s5, v29, v10, s5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, s5, v13, v11, s5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s5, v26, v12, s5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v8, v15, s5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v31, s4 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v22, s2 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v24, s3 ; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v30, s1 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v24, vcc_lo -; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, v7, v23, s0 -; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v19, v18, v[9:10] +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v28, s0 +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, v7, v27, vcc_lo +; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v21, v20, v[9:10] ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_mul_i256: @@ -2769,103 +2749,99 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1 -; GFX12-NEXT: v_mul_lo_u32 v29, v4, v11 -; GFX12-NEXT: v_mul_lo_u32 v31, v3, v12 -; GFX12-NEXT: v_mul_lo_u32 v30, v2, v13 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_dual_mov_b32 v18, v2 :: v_dual_mov_b32 v19, v3 +; GFX12-NEXT: v_mul_lo_u32 v27, v6, v9 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v16, v14, 0 -; GFX12-NEXT: v_mul_lo_u32 v28, v17, v14 -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], null, v17, v13, v[0:1] -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v16, v12, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], null, v2, v12, v[18:19] -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[0:1] +; GFX12-NEXT: v_mul_lo_u32 v30, v4, v11 +; GFX12-NEXT: v_mul_lo_u32 v28, v5, v10 +; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, v17, v13, v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v18, v12, v[2:3] +; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, v16, v12, 0 +; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], null, v19, v11, v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v11, v[2:3] ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v22, 0, 1, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v3, v11, v[20:21] -; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v2, v10, v[18:19] -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v22, vcc_lo -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], null, v4, v10, v[0:1] +; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, v4, v10, v[20:21] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], vcc_lo, v3, v9, v[20:21] +; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v18, v10, v[0:1] ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v22, vcc_lo +; GFX12-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v22, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], null, v5, v9, v[18:19] -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], null, v16, v10, 0 -; GFX12-NEXT: v_mad_co_u64_u32 v[22:23], vcc_lo, v4, v8, v[0:1] +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v5, v9, v[2:3] +; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, v16, v10, 0 +; GFX12-NEXT: v_mad_co_u64_u32 v[23:24], vcc_lo, v19, v9, v[20:21] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_mad_co_u64_u32 v[25:26], null, v6, v8, v[0:1] +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[2:3] ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v26, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_mad_co_u64_u32 v[24:25], null, v6, v8, v[20:21] -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[18:19] +; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v22, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX12-NEXT: v_mov_b32_e32 v18, v23 +; GFX12-NEXT: v_mad_co_u64_u32 v[21:22], vcc_lo, v4, v8, v[23:24] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v27, 0, 1, s0 -; GFX12-NEXT: v_mul_lo_u32 v23, v6, v9 -; GFX12-NEXT: v_mov_b32_e32 v19, v24 -; GFX12-NEXT: v_mul_lo_u32 v24, v5, v10 -; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v2, v8, v[0:1] +; GFX12-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v27, vcc_lo -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v16, v13, v[18:19] -; GFX12-NEXT: v_mov_b32_e32 v19, v22 -; GFX12-NEXT: v_mul_lo_u32 v27, v16, v15 -; GFX12-NEXT: v_mov_b32_e32 v18, v21 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_mad_co_u64_u32 v[21:22], vcc_lo, v17, v12, v[0:1] -; GFX12-NEXT: v_mad_co_u64_u32 v[14:15], s2, v16, v11, v[18:19] -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v16, v8, 0 +; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v6, vcc_lo +; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], s0, v18, v8, v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v23, v25 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v32, 0, 1, s2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s1, v2, v11, v[21:22] -; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], s2, v17, v10, v[14:15] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_dual_mov_b32 v13, v1 :: v_dual_mov_b32 v14, v20 +; GFX12-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v20, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_mov_b32_e32 v20, v3 +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], vcc_lo, v16, v13, v[22:23] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_mad_co_u64_u32 v[24:25], s2, v16, v11, v[20:21] +; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 +; GFX12-NEXT: v_mad_co_u64_u32 v[22:23], s0, v17, v12, v[0:1] +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v16, v8, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], s1, v18, v11, v[22:23] +; GFX12-NEXT: v_mul_lo_u32 v22, v16, v15 +; GFX12-NEXT: v_mul_lo_u32 v23, v17, v14 +; GFX12-NEXT: v_mad_co_u64_u32 v[14:15], s2, v17, v10, v[24:25] +; GFX12-NEXT: v_mul_lo_u32 v24, v19, v12 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v32, s2 -; GFX12-NEXT: v_mad_co_u64_u32 v[21:22], s3, v3, v10, v[18:19] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s2, v2, v9, v[11:12] +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, s2 +; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], s3, v19, v10, v[20:21] +; GFX12-NEXT: v_mul_lo_u32 v25, v18, v13 +; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], s2, v18, v9, v[14:15] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v1, s2 -; GFX12-NEXT: v_mad_co_u64_u32 v[10:11], s4, v16, v9, v[13:14] +; GFX12-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v3, s2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s2, v4, v9, v[21:22] +; GFX12-NEXT: v_mad_co_u64_u32 v[13:14], s2, v4, v9, v[11:12] +; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], s4, v16, v9, v[1:2] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4 -; GFX12-NEXT: v_mad_co_u64_u32 v[12:13], s4, v3, v8, v[18:19] +; GFX12-NEXT: v_cndmask_b32_e64 v16, 0, 1, s4 +; GFX12-NEXT: v_mad_co_u64_u32 v[9:10], s4, v19, v8, v[20:21] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v15, s4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_mad_co_u64_u32 v[14:15], s4, v5, v8, v[1:2] -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s5, v17, v8, v[10:11] +; GFX12-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, s4 +; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], s4, v5, v8, v[13:14] +; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s5, v17, v8, v[3:4] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s5, v4, v12, s5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s5, v16, v9, s5 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v4, s5, v6, v13, s5 +; GFX12-NEXT: v_add_co_ci_u32_e64 v4, s5, v29, v10, s5 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_co_ci_u32_e64 v5, s5, v9, v14, s5 +; GFX12-NEXT: v_add_co_ci_u32_e64 v5, s5, v15, v11, s5 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v6, s5, v26, v15, s5 +; GFX12-NEXT: v_add_co_ci_u32_e64 v6, s5, v6, v12, s5 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v25, v27, s5 -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v28, s4 +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v26, v22, s5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v30, s2 -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v31, s3 +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v23, s4 +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v25, s2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v24, s3 +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v30, s1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v29, s1 +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v28, s0 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v24, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v23, s0 +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v27, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mad_co_u64_u32 v[7:8], null, v7, v8, v[9:10] ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -3155,8 +3131,8 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v6, 0 ; GFX7-NEXT: v_ashrrev_i32_e32 v7, 31, v4 -; GFX7-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v6, v[3:4] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v6, v[5:6] ; GFX7-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: s_endpgm ; @@ -3167,8 +3143,8 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v6, 0 ; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v4 -; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v7, v6, v[3:4] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 +; GFX8-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v7, v6, v[5:6] ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; @@ -3179,8 +3155,8 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v6, 0 ; GFX9-NEXT: v_ashrrev_i32_e32 v7, 31, v4 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v7, v6, v[3:4] -; GFX9-NEXT: v_mov_b32_e32 v3, v4 +; GFX9-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v7, v6, v[5:6] ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX9-NEXT: s_endpgm ; @@ -3200,8 +3176,8 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, 0x50, v4, 0 ; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v4 -; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, 0x50, v6, v[3:4] -; GFX11-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v5, v3 +; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, 0x50, v6, v[5:6] ; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX11-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll index 2843f72353db1..b7c84f1389197 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll @@ -31,102 +31,100 @@ define i64 @v_sdiv_i64(i64 %num, i64 %den) { ; CHECK-NEXT: v_xor_b32_e32 v1, v3, v0 ; CHECK-NEXT: v_cvt_f32_u32_e32 v3, v2 ; CHECK-NEXT: v_cvt_f32_u32_e32 v6, v1 -; CHECK-NEXT: v_sub_i32_e32 v12, vcc, 0, v2 -; CHECK-NEXT: v_subb_u32_e32 v13, vcc, 0, v1, vcc +; CHECK-NEXT: v_sub_i32_e32 v13, vcc, 0, v2 +; CHECK-NEXT: v_subb_u32_e32 v14, vcc, 0, v1, vcc ; CHECK-NEXT: v_mac_f32_e32 v3, 0x4f800000, v6 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; CHECK-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 ; CHECK-NEXT: v_mul_f32_e32 v6, 0x2f800000, v3 -; CHECK-NEXT: v_trunc_f32_e32 v8, v6 -; CHECK-NEXT: v_mac_f32_e32 v3, 0xcf800000, v8 -; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v3 -; CHECK-NEXT: v_cvt_u32_f32_e32 v14, v8 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v11, 0 -; CHECK-NEXT: v_mov_b32_e32 v3, v7 -; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v14, v[3:4] -; CHECK-NEXT: v_mul_lo_u32 v3, v14, v6 -; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[7:8] -; CHECK-NEXT: v_mul_hi_u32 v7, v11, v6 -; CHECK-NEXT: v_mul_hi_u32 v6, v14, v6 -; CHECK-NEXT: v_mul_lo_u32 v8, v11, v9 -; CHECK-NEXT: v_mul_lo_u32 v10, v14, v9 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; CHECK-NEXT: v_mul_hi_u32 v7, v11, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v3 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v10, v6 +; CHECK-NEXT: v_trunc_f32_e32 v6, v6 +; CHECK-NEXT: v_mac_f32_e32 v3, 0xcf800000, v6 +; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v3 +; CHECK-NEXT: v_cvt_u32_f32_e32 v12, v6 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v3, 0 +; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v12, v[7:8] +; CHECK-NEXT: v_mul_lo_u32 v7, v12, v6 +; CHECK-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v3, v[8:9] +; CHECK-NEXT: v_mul_hi_u32 v8, v3, v6 +; CHECK-NEXT: v_mul_hi_u32 v6, v12, v6 +; CHECK-NEXT: v_mul_lo_u32 v9, v3, v10 +; CHECK-NEXT: v_mul_lo_u32 v11, v12, v10 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; CHECK-NEXT: v_mul_hi_u32 v8, v3, v10 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v11, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; CHECK-NEXT: v_mul_hi_u32 v9, v12, v10 ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CHECK-NEXT: v_mul_hi_u32 v8, v14, v9 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CHECK-NEXT: v_add_i32_e32 v11, vcc, v11, v3 -; CHECK-NEXT: v_addc_u32_e32 v14, vcc, v14, v6, vcc -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v11, 0 -; CHECK-NEXT: v_mov_b32_e32 v3, v7 -; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v14, v[3:4] -; CHECK-NEXT: v_ashrrev_i32_e32 v12, 31, v5 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v12 -; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[7:8] -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v12, vcc -; CHECK-NEXT: v_xor_b32_e32 v10, v3, v12 -; CHECK-NEXT: v_mul_lo_u32 v3, v14, v6 -; CHECK-NEXT: v_mul_lo_u32 v5, v11, v9 -; CHECK-NEXT: v_xor_b32_e32 v13, v4, v12 -; CHECK-NEXT: v_mul_hi_u32 v4, v11, v6 -; CHECK-NEXT: v_mul_hi_u32 v6, v14, v6 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v4, v14, v9 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; CHECK-NEXT: v_mul_hi_u32 v5, v11, v9 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; CHECK-NEXT: v_addc_u32_e32 v12, vcc, v12, v7, vcc +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v3, 0 +; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v12, v[7:8] +; CHECK-NEXT: v_ashrrev_i32_e32 v13, 31, v5 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v13 +; CHECK-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v3, v[8:9] +; CHECK-NEXT: v_addc_u32_e32 v5, vcc, v5, v13, vcc +; CHECK-NEXT: v_xor_b32_e32 v9, v4, v13 +; CHECK-NEXT: v_mul_lo_u32 v4, v12, v6 +; CHECK-NEXT: v_mul_lo_u32 v7, v3, v10 +; CHECK-NEXT: v_xor_b32_e32 v11, v5, v13 +; CHECK-NEXT: v_mul_hi_u32 v5, v3, v6 +; CHECK-NEXT: v_mul_hi_u32 v6, v12, v6 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_mul_hi_u32 v6, v14, v9 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v5, v12, v10 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; CHECK-NEXT: v_mul_hi_u32 v7, v3, v10 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CHECK-NEXT: v_mul_hi_u32 v7, v12, v10 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v11, v3 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v14, v4, vcc -; CHECK-NEXT: v_mul_lo_u32 v5, v13, v3 -; CHECK-NEXT: v_mul_lo_u32 v6, v10, v4 -; CHECK-NEXT: v_mul_hi_u32 v7, v10, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v13, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v12, v5, vcc +; CHECK-NEXT: v_mul_lo_u32 v5, v11, v3 +; CHECK-NEXT: v_mul_lo_u32 v6, v9, v4 +; CHECK-NEXT: v_mul_hi_u32 v7, v9, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v11, v3 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, v13, v4 +; CHECK-NEXT: v_mul_lo_u32 v7, v11, v4 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_mul_hi_u32 v6, v10, v4 +; CHECK-NEXT: v_mul_hi_u32 v6, v9, v4 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CHECK-NEXT: v_add_i32_e32 v9, vcc, v3, v5 -; CHECK-NEXT: v_mul_hi_u32 v7, v13, v4 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v9, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_add_i32_e32 v11, vcc, v7, v5 -; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v11, v[4:5] -; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v10, v3 -; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v9, v[5:6] -; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v13, v7, vcc -; CHECK-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v7 +; CHECK-NEXT: v_add_i32_e32 v10, vcc, v3, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v3 +; CHECK-NEXT: v_mul_hi_u32 v6, v11, v4 +; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v10, 0 +; CHECK-NEXT: v_add_i32_e32 v12, vcc, v6, v5 +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v12, v[4:5] +; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v9, v3 +; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v10, v[5:6] +; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v11, v7, vcc +; CHECK-NEXT: v_sub_i32_e64 v5, s[4:5], v11, v7 ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1 ; CHECK-NEXT: v_subb_u32_e32 v5, vcc, v5, v1, vcc ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] @@ -136,8 +134,8 @@ define i64 @v_sdiv_i64(i64 %num, i64 %den) { ; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1 ; CHECK-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; CHECK-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] -; CHECK-NEXT: v_add_i32_e32 v6, vcc, 1, v9 -; CHECK-NEXT: v_addc_u32_e32 v7, vcc, 0, v11, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, 1, v10 +; CHECK-NEXT: v_addc_u32_e32 v7, vcc, 0, v12, vcc ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v5, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2 @@ -150,9 +148,9 @@ define i64 @v_sdiv_i64(i64 %num, i64 %den) { ; CHECK-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v3, v12, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc +; CHECK-NEXT: v_xor_b32_e32 v3, v13, v0 +; CHECK-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc ; CHECK-NEXT: v_xor_b32_e32 v0, v1, v3 ; CHECK-NEXT: v_xor_b32_e32 v1, v2, v3 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 @@ -218,10 +216,10 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-NEXT: s_subb_u32 s5, 0, s11 ; CHECK-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; CHECK-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; CHECK-NEXT: v_trunc_f32_e32 v2, v1 -; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2 +; CHECK-NEXT: v_trunc_f32_e32 v1, v1 +; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; CHECK-NEXT: v_cvt_u32_f32_e32 v6, v0 -; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v1 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v6, 0 ; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v7, v[1:2] ; CHECK-NEXT: v_mul_lo_u32 v1, v7, v0 @@ -292,11 +290,11 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v0, v2 -; CHECK-NEXT: v_mul_hi_u32 v4, s13, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v0 +; CHECK-NEXT: v_mul_hi_u32 v3, s13, v1 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v6, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v2 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v3, v2 ; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s10, v4, v[1:2] ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, s12, v0 ; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s11, v6, v[2:3] @@ -379,266 +377,260 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 ; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v8, vcc ; GISEL-NEXT: v_xor_b32_e32 v10, v4, v8 -; GISEL-NEXT: v_xor_b32_e32 v4, v5, v8 -; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v10 -; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v4 +; GISEL-NEXT: v_xor_b32_e32 v9, v5, v8 +; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v10 +; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v9 ; GISEL-NEXT: v_sub_i32_e32 v17, vcc, 0, v10 -; GISEL-NEXT: v_subb_u32_e32 v18, vcc, 0, v4, vcc -; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v9 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5 -; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 -; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v5 -; GISEL-NEXT: v_trunc_f32_e32 v9, v9 -; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v9 +; GISEL-NEXT: v_subb_u32_e32 v18, vcc, 0, v9, vcc +; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 +; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 +; GISEL-NEXT: v_trunc_f32_e32 v5, v5 +; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 +; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v4 ; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v5 -; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v16, 0 -; GISEL-NEXT: v_mov_b32_e32 v5, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v9, v[5:6] -; GISEL-NEXT: v_mul_lo_u32 v5, v9, v11 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v16, v[12:13] -; GISEL-NEXT: v_mul_hi_u32 v13, v16, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v9, v11 -; GISEL-NEXT: v_mul_lo_u32 v12, v16, v14 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v17, v15, 0 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v16, v[5:6] +; GISEL-NEXT: v_mul_lo_u32 v5, v16, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v18, v15, v[11:12] +; GISEL-NEXT: v_mul_lo_u32 v11, v15, v13 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v15, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13 +; GISEL-NEXT: v_mul_hi_u32 v4, v16, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v9, v14 +; GISEL-NEXT: v_mul_lo_u32 v11, v16, v13 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5 -; GISEL-NEXT: v_mul_hi_u32 v12, v16, v14 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v15, v13 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; GISEL-NEXT: v_mul_hi_u32 v13, v9, v14 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v16, v13 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v5 -; GISEL-NEXT: v_addc_u32_e32 v19, vcc, v9, v11, vcc -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v16, 0 -; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; GISEL-NEXT: v_mov_b32_e32 v5, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v19, v[5:6] -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v16, v[12:13] -; GISEL-NEXT: v_xor_b32_e32 v17, v0, v9 -; GISEL-NEXT: v_mul_lo_u32 v0, v19, v11 -; GISEL-NEXT: v_mul_lo_u32 v5, v16, v14 -; GISEL-NEXT: v_xor_b32_e32 v18, v1, v9 -; GISEL-NEXT: v_mul_hi_u32 v1, v16, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v19, v11 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v4 +; GISEL-NEXT: v_addc_u32_e32 v16, vcc, v16, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v17, v15, 0 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v16, v[5:6] +; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v18, v15, v[11:12] +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc +; GISEL-NEXT: v_xor_b32_e32 v17, v0, v5 +; GISEL-NEXT: v_mul_lo_u32 v0, v16, v4 +; GISEL-NEXT: v_mul_lo_u32 v11, v15, v13 +; GISEL-NEXT: v_xor_b32_e32 v18, v1, v5 +; GISEL-NEXT: v_mul_hi_u32 v1, v15, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v16, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v19, v14 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_mul_hi_u32 v5, v16, v14 +; GISEL-NEXT: v_mul_lo_u32 v1, v16, v13 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 +; GISEL-NEXT: v_mul_hi_u32 v11, v15, v13 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5 -; GISEL-NEXT: v_mul_hi_u32 v11, v19, v14 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v16, v13 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v18, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v16, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v4, v18, v0 ; GISEL-NEXT: v_mul_lo_u32 v11, v17, v1 ; GISEL-NEXT: v_mul_hi_u32 v12, v17, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v11 +; GISEL-NEXT: v_xor_b32_e32 v5, v5, v8 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GISEL-NEXT: v_mul_lo_u32 v12, v18, v1 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v4 ; GISEL-NEXT: v_mul_hi_u32 v11, v17, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v12, v11 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; GISEL-NEXT: v_mul_hi_u32 v1, v18, v1 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v0, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v1, v5 -; GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v10, v16, v[1:2] -; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v7 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v6, v5 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v7, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v7, v1, v5 -; GISEL-NEXT: v_xor_b32_e32 v6, v6, v5 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v4, v0, v[12:13] -; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v7 -; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v6 -; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v17, v11 -; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], v18, v14 -; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v12 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v18, v14, vcc -; GISEL-NEXT: v_subb_u32_e32 v14, vcc, v11, v4, vcc -; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 -; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v1 -; GISEL-NEXT: v_trunc_f32_e32 v15, v11 -; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v15 -; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v1 -; GISEL-NEXT: v_sub_i32_e32 v19, vcc, 0, v7 -; GISEL-NEXT: v_subb_u32_e32 v20, vcc, 0, v6, vcc -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[6:7], v19, v18, 0 -; GISEL-NEXT: v_cvt_u32_f32_e32 v22, v15 -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v13, v10 -; GISEL-NEXT: v_subbrev_u32_e32 v21, vcc, 0, v14, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v10 -; GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v10 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v19, v22, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v1, v22, v11 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v21, v4 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v20, v18, v[12:13] -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v17, v4 -; GISEL-NEXT: v_mul_lo_u32 v10, v18, v14 -; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v18, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[6:7] -; GISEL-NEXT: v_mul_hi_u32 v11, v22, v11 -; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v21, v4 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v17, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v4, v13, v15, s[4:5] -; GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v16, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, 1, v10 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v13, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v15, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e32 v13, v13, v17, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; GISEL-NEXT: v_mul_lo_u32 v12, v22, v14 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_mul_hi_u32 v12, v18, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 v14, v22, v14 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v12 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v14, v11 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v18, v10 -; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v22, v11, vcc -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v19, v12, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc -; GISEL-NEXT: v_mov_b32_e32 v0, v11 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v16, v13, vcc -; GISEL-NEXT: v_xor_b32_e32 v13, v9, v8 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v19, v14, v[0:1] -; GISEL-NEXT: v_xor_b32_e32 v11, v1, v13 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v0, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v0 +; GISEL-NEXT: v_mul_hi_u32 v11, v18, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v15, 0 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v11, v4 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v16, v[1:2] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v17, v0 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v9, v15, v[11:12] +; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v18, v13, vcc +; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v18, v13 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v9 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v4, v9, vcc +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v10 +; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v15 +; GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v16, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v9 +; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v4 +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v7, v4, vcc +; GISEL-NEXT: v_xor_b32_e32 v7, v6, v4 +; GISEL-NEXT: v_xor_b32_e32 v6, v17, v4 +; GISEL-NEXT: v_cvt_f32_u32_e32 v17, v7 +; GISEL-NEXT: v_cvt_f32_u32_e32 v18, v6 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v9 +; GISEL-NEXT: v_mac_f32_e32 v17, 0x4f800000, v18 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v17 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v14, v0, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v12 +; GISEL-NEXT: v_addc_u32_e32 v14, vcc, 0, v13, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1 +; GISEL-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GISEL-NEXT: v_trunc_f32_e32 v1, v1 +; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v0 +; GISEL-NEXT: v_sub_i32_e64 v19, s[4:5], 0, v7 +; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v1 +; GISEL-NEXT: v_subb_u32_e64 v20, s[4:5], 0, v6, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v17, 0 +; GISEL-NEXT: v_cndmask_b32_e32 v12, v12, v9, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v13, v13, v14, vcc +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v19, v18, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v21, v18, v0 +; GISEL-NEXT: v_mul_hi_u32 v22, v17, v0 +; GISEL-NEXT: v_mul_hi_u32 v23, v18, v0 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v17, v[9:10] +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GISEL-NEXT: v_cndmask_b32_e32 v10, v15, v12, vcc +; GISEL-NEXT: v_mul_lo_u32 v1, v17, v0 +; GISEL-NEXT: v_mul_lo_u32 v11, v18, v0 ; GISEL-NEXT: v_ashrrev_i32_e32 v15, 31, v3 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v12, v[8:9] -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v15 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v3, v15, vcc -; GISEL-NEXT: v_xor_b32_e32 v16, v1, v15 -; GISEL-NEXT: v_mul_lo_u32 v1, v14, v10 -; GISEL-NEXT: v_mul_lo_u32 v3, v12, v0 -; GISEL-NEXT: v_xor_b32_e32 v17, v2, v15 -; GISEL-NEXT: v_mul_hi_u32 v2, v12, v10 -; GISEL-NEXT: v_mul_hi_u32 v8, v14, v10 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v13, v16, v13, vcc +; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v21, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v22 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v9, v1 +; GISEL-NEXT: v_mul_hi_u32 v9, v17, v0 +; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v23 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v11, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 +; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 +; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v9, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v11, v9 +; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v0, v9 +; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v17, v1 +; GISEL-NEXT: v_addc_u32_e64 v14, s[4:5], v18, v0, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v12, 0 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v19, v14, v[1:2] +; GISEL-NEXT: v_xor_b32_e32 v1, v10, v5 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v15 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v20, v12, v[8:9] +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v15, vcc +; GISEL-NEXT: v_xor_b32_e32 v16, v2, v15 ; GISEL-NEXT: v_mul_lo_u32 v2, v14, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; GISEL-NEXT: v_mul_lo_u32 v8, v12, v10 +; GISEL-NEXT: v_xor_b32_e32 v17, v3, v15 ; GISEL-NEXT: v_mul_hi_u32 v3, v12, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v8, v3 -; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1 -; GISEL-NEXT: v_addc_u32_e32 v0, vcc, v14, v0, vcc -; GISEL-NEXT: v_mul_lo_u32 v2, v17, v1 -; GISEL-NEXT: v_mul_lo_u32 v3, v16, v0 -; GISEL-NEXT: v_xor_b32_e32 v8, v4, v13 -; GISEL-NEXT: v_mul_hi_u32 v4, v16, v1 -; GISEL-NEXT: v_mul_hi_u32 v1, v17, v1 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GISEL-NEXT: v_mul_lo_u32 v3, v14, v10 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v8, v2 +; GISEL-NEXT: v_mul_hi_u32 v8, v12, v10 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v14, v10 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v4, v17, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GISEL-NEXT: v_mul_hi_u32 v3, v16, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v3 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v1, v2 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v8, v2 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v14, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v17, v0 +; GISEL-NEXT: v_mul_lo_u32 v8, v16, v2 +; GISEL-NEXT: v_mul_hi_u32 v9, v16, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v17, v0 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v10, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v0, v1 -; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v12, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v11, v13 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v13, vcc -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v10, v[3:4] +; GISEL-NEXT: v_xor_b32_e32 v10, v13, v5 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v9, v17, v2 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v8, v3 +; GISEL-NEXT: v_mul_hi_u32 v8, v16, v2 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v0, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; GISEL-NEXT: v_mul_hi_u32 v8, v17, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v12, 0 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v8, v0 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v7, v13, v[3:4] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v5 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v10, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v12, v[8:9] ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v16, v2 -; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v17, v8, vcc -; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v17, v8 +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v17, v10, vcc +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v17, v10 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v6 -; GISEL-NEXT: v_subb_u32_e32 v4, vcc, v4, v6, vcc +; GISEL-NEXT: v_subb_u32_e32 v5, vcc, v5, v6, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v7 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v6 -; GISEL-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc +; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v3, v8, v9, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v10 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v12, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v4, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v12 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v13, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v5, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v8 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v8 ; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v4, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v4, v9, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v5, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v12, v4, vcc -; GISEL-NEXT: v_xor_b32_e32 v4, v15, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc +; GISEL-NEXT: v_xor_b32_e32 v4, v15, v4 +; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v5, vcc ; GISEL-NEXT: v_xor_b32_e32 v2, v2, v4 ; GISEL-NEXT: v_xor_b32_e32 v3, v3, v4 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 @@ -667,28 +659,28 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_xor_b32_e32 v1, v3, v0 ; CGP-NEXT: v_cvt_f32_u32_e32 v3, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v1 -; CGP-NEXT: v_sub_i32_e32 v15, vcc, 0, v2 -; CGP-NEXT: v_subb_u32_e32 v16, vcc, 0, v1, vcc +; CGP-NEXT: v_sub_i32_e32 v16, vcc, 0, v2 +; CGP-NEXT: v_subb_u32_e32 v17, vcc, 0, v1, vcc ; CGP-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4 ; CGP-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; CGP-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 ; CGP-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 -; CGP-NEXT: v_trunc_f32_e32 v5, v4 -; CGP-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5 -; CGP-NEXT: v_cvt_u32_f32_e32 v14, v3 -; CGP-NEXT: v_cvt_u32_f32_e32 v17, v5 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v15, v14, 0 -; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[4:5] -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v16, v14, v[12:13] -; CGP-NEXT: v_mul_lo_u32 v5, v17, v3 -; CGP-NEXT: v_mul_hi_u32 v12, v14, v3 -; CGP-NEXT: v_mul_lo_u32 v13, v14, v4 -; CGP-NEXT: v_mul_hi_u32 v3, v17, v3 -; CGP-NEXT: v_mul_lo_u32 v18, v17, v4 +; CGP-NEXT: v_trunc_f32_e32 v4, v4 +; CGP-NEXT: v_mac_f32_e32 v3, 0xcf800000, v4 +; CGP-NEXT: v_cvt_u32_f32_e32 v15, v3 +; CGP-NEXT: v_cvt_u32_f32_e32 v14, v4 +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v15, 0 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[4:5] +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v17, v15, v[12:13] +; CGP-NEXT: v_mul_lo_u32 v5, v14, v3 +; CGP-NEXT: v_mul_hi_u32 v12, v15, v3 +; CGP-NEXT: v_mul_lo_u32 v13, v15, v4 +; CGP-NEXT: v_mul_hi_u32 v3, v14, v3 +; CGP-NEXT: v_mul_lo_u32 v18, v14, v4 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v13 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; CGP-NEXT: v_mul_hi_u32 v12, v14, v4 +; CGP-NEXT: v_mul_hi_u32 v12, v15, v4 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v13, v5 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v18, v3 @@ -696,44 +688,44 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; CGP-NEXT: v_mul_hi_u32 v4, v17, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v14, v4 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v3 -; CGP-NEXT: v_addc_u32_e32 v17, vcc, v17, v4, vcc -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v15, v14, 0 -; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[4:5] -; CGP-NEXT: v_ashrrev_i32_e32 v15, 31, v11 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v16, v14, v[12:13] -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v15 -; CGP-NEXT: v_addc_u32_e32 v10, vcc, v11, v15, vcc -; CGP-NEXT: v_xor_b32_e32 v12, v5, v15 -; CGP-NEXT: v_mul_lo_u32 v5, v17, v3 -; CGP-NEXT: v_mul_lo_u32 v11, v14, v4 -; CGP-NEXT: v_xor_b32_e32 v13, v10, v15 -; CGP-NEXT: v_mul_hi_u32 v10, v14, v3 -; CGP-NEXT: v_mul_hi_u32 v3, v17, v3 +; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v3 +; CGP-NEXT: v_addc_u32_e32 v14, vcc, v14, v4, vcc +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v15, 0 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[4:5] +; CGP-NEXT: v_ashrrev_i32_e32 v16, 31, v11 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v17, v15, v[12:13] +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v16 +; CGP-NEXT: v_addc_u32_e32 v10, vcc, v11, v16, vcc +; CGP-NEXT: v_xor_b32_e32 v12, v5, v16 +; CGP-NEXT: v_mul_lo_u32 v5, v14, v3 +; CGP-NEXT: v_mul_lo_u32 v11, v15, v4 +; CGP-NEXT: v_xor_b32_e32 v13, v10, v16 +; CGP-NEXT: v_mul_hi_u32 v10, v15, v3 +; CGP-NEXT: v_mul_hi_u32 v3, v14, v3 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v10, v17, v4 +; CGP-NEXT: v_mul_lo_u32 v10, v14, v4 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v11, v5 -; CGP-NEXT: v_mul_hi_u32 v11, v14, v4 +; CGP-NEXT: v_mul_hi_u32 v11, v15, v4 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v3 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; CGP-NEXT: v_mul_hi_u32 v4, v17, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v14, v4 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v14, v3 -; CGP-NEXT: v_addc_u32_e32 v4, vcc, v17, v4, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v15, v3 +; CGP-NEXT: v_addc_u32_e32 v4, vcc, v14, v4, vcc ; CGP-NEXT: v_mul_lo_u32 v5, v13, v3 ; CGP-NEXT: v_mul_lo_u32 v10, v12, v4 ; CGP-NEXT: v_mul_hi_u32 v11, v12, v3 @@ -751,12 +743,12 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; CGP-NEXT: v_add_i32_e32 v14, vcc, v3, v5 -; CGP-NEXT: v_mul_hi_u32 v11, v13, v4 +; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v3 +; CGP-NEXT: v_mul_hi_u32 v10, v13, v4 ; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v14, 0 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v11, v5 -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v2, v16, v[4:5] +; CGP-NEXT: v_add_i32_e32 v15, vcc, v10, v5 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v2, v15, v[4:5] ; CGP-NEXT: v_sub_i32_e32 v3, vcc, v12, v3 ; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v14, v[10:11] ; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v13, v4, vcc @@ -771,7 +763,7 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc ; CGP-NEXT: v_cndmask_b32_e64 v5, v10, v11, s[4:5] ; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v14 -; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v16, vcc +; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v15, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v1 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2 @@ -785,8 +777,8 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CGP-NEXT: v_cndmask_b32_e32 v1, v14, v1, vcc -; CGP-NEXT: v_xor_b32_e32 v3, v15, v0 -; CGP-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc +; CGP-NEXT: v_xor_b32_e32 v3, v16, v0 +; CGP-NEXT: v_cndmask_b32_e32 v2, v15, v2, vcc ; CGP-NEXT: v_xor_b32_e32 v0, v1, v3 ; CGP-NEXT: v_xor_b32_e32 v1, v2, v3 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 @@ -840,28 +832,28 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_xor_b32_e32 v3, v5, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v5, v4 ; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3 -; CGP-NEXT: v_sub_i32_e32 v13, vcc, 0, v4 -; CGP-NEXT: v_subb_u32_e32 v14, vcc, 0, v3, vcc +; CGP-NEXT: v_sub_i32_e32 v14, vcc, 0, v4 +; CGP-NEXT: v_subb_u32_e32 v15, vcc, 0, v3, vcc ; CGP-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 ; CGP-NEXT: v_rcp_iflag_f32_e32 v5, v5 ; CGP-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 ; CGP-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 -; CGP-NEXT: v_trunc_f32_e32 v7, v6 -; CGP-NEXT: v_mac_f32_e32 v5, 0xcf800000, v7 -; CGP-NEXT: v_cvt_u32_f32_e32 v12, v5 -; CGP-NEXT: v_cvt_u32_f32_e32 v15, v7 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0 -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v15, v[6:7] -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v12, v[10:11] -; CGP-NEXT: v_mul_lo_u32 v7, v15, v5 -; CGP-NEXT: v_mul_hi_u32 v10, v12, v5 -; CGP-NEXT: v_mul_lo_u32 v11, v12, v6 -; CGP-NEXT: v_mul_hi_u32 v5, v15, v5 -; CGP-NEXT: v_mul_lo_u32 v16, v15, v6 +; CGP-NEXT: v_trunc_f32_e32 v6, v6 +; CGP-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6 +; CGP-NEXT: v_cvt_u32_f32_e32 v13, v5 +; CGP-NEXT: v_cvt_u32_f32_e32 v12, v6 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v13, 0 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[6:7] +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v15, v13, v[10:11] +; CGP-NEXT: v_mul_lo_u32 v7, v12, v5 +; CGP-NEXT: v_mul_hi_u32 v10, v13, v5 +; CGP-NEXT: v_mul_lo_u32 v11, v13, v6 +; CGP-NEXT: v_mul_hi_u32 v5, v12, v5 +; CGP-NEXT: v_mul_lo_u32 v16, v12, v6 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v12, v6 +; CGP-NEXT: v_mul_hi_u32 v10, v13, v6 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v7, vcc, v11, v7 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v16, v5 @@ -869,53 +861,53 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_mul_hi_u32 v6, v15, v6 +; CGP-NEXT: v_mul_hi_u32 v6, v12, v6 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v5 -; CGP-NEXT: v_addc_u32_e32 v15, vcc, v15, v6, vcc -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0 -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v15, v[6:7] -; CGP-NEXT: v_ashrrev_i32_e32 v13, 31, v9 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v12, v[10:11] -; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v13 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v13, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v7, v13 -; CGP-NEXT: v_mul_lo_u32 v7, v15, v5 -; CGP-NEXT: v_mul_lo_u32 v9, v12, v6 -; CGP-NEXT: v_xor_b32_e32 v14, v8, v13 -; CGP-NEXT: v_mul_hi_u32 v8, v12, v5 -; CGP-NEXT: v_mul_hi_u32 v5, v15, v5 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v5 +; CGP-NEXT: v_addc_u32_e32 v12, vcc, v12, v6, vcc +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v13, 0 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[6:7] +; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v9 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v15, v13, v[10:11] +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v14 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v14, vcc +; CGP-NEXT: v_xor_b32_e32 v11, v7, v14 +; CGP-NEXT: v_mul_lo_u32 v7, v12, v5 +; CGP-NEXT: v_mul_lo_u32 v9, v13, v6 +; CGP-NEXT: v_xor_b32_e32 v15, v8, v14 +; CGP-NEXT: v_mul_hi_u32 v8, v13, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v12, v5 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v8, v15, v6 +; CGP-NEXT: v_mul_lo_u32 v8, v12, v6 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; CGP-NEXT: v_mul_hi_u32 v9, v12, v6 +; CGP-NEXT: v_mul_hi_u32 v9, v13, v6 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; CGP-NEXT: v_mul_hi_u32 v6, v15, v6 +; CGP-NEXT: v_mul_hi_u32 v6, v12, v6 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5 -; CGP-NEXT: v_addc_u32_e32 v6, vcc, v15, v6, vcc -; CGP-NEXT: v_mul_lo_u32 v7, v14, v5 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v13, v5 +; CGP-NEXT: v_addc_u32_e32 v6, vcc, v12, v6, vcc +; CGP-NEXT: v_mul_lo_u32 v7, v15, v5 ; CGP-NEXT: v_mul_lo_u32 v8, v11, v6 ; CGP-NEXT: v_mul_hi_u32 v9, v11, v5 -; CGP-NEXT: v_mul_hi_u32 v5, v14, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v15, v5 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v14, v6 +; CGP-NEXT: v_mul_lo_u32 v9, v15, v6 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; CGP-NEXT: v_mul_hi_u32 v8, v11, v6 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 @@ -924,16 +916,16 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v5, v7 -; CGP-NEXT: v_mul_hi_u32 v9, v14, v6 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v5 +; CGP-NEXT: v_mul_hi_u32 v8, v15, v6 ; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v12, 0 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CGP-NEXT: v_add_i32_e32 v15, vcc, v9, v7 -; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v15, v[6:7] +; CGP-NEXT: v_add_i32_e32 v13, vcc, v8, v7 +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v13, v[6:7] ; CGP-NEXT: v_sub_i32_e32 v5, vcc, v11, v5 ; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v3, v12, v[7:8] -; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v14, v9, vcc -; CGP-NEXT: v_sub_i32_e64 v7, s[4:5], v14, v9 +; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v15, v9, vcc +; CGP-NEXT: v_sub_i32_e64 v7, s[4:5], v15, v9 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3 ; CGP-NEXT: v_subb_u32_e32 v7, vcc, v7, v3, vcc ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] @@ -944,7 +936,7 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v7, vcc ; CGP-NEXT: v_cndmask_b32_e64 v6, v8, v9, s[4:5] ; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v12 -; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v15, vcc +; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v13, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v7, v3 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v4 @@ -958,8 +950,8 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v5, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; CGP-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc -; CGP-NEXT: v_xor_b32_e32 v5, v13, v2 -; CGP-NEXT: v_cndmask_b32_e32 v4, v15, v4, vcc +; CGP-NEXT: v_xor_b32_e32 v5, v14, v2 +; CGP-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc ; CGP-NEXT: v_xor_b32_e32 v2, v3, v5 ; CGP-NEXT: v_xor_b32_e32 v3, v4, v5 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 @@ -1054,10 +1046,10 @@ define i64 @v_sdiv_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 -; CHECK-NEXT: v_trunc_f32_e32 v4, v3 -; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 +; CHECK-NEXT: v_trunc_f32_e32 v3, v3 +; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 ; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v4 +; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v3 ; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 ; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4] ; CHECK-NEXT: v_mul_lo_u32 v3, v10, v2 @@ -1133,11 +1125,11 @@ define i64 @v_sdiv_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v0, v2 -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v0 +; CHECK-NEXT: v_mul_hi_u32 v3, v5, v1 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v3, v2 ; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v8, v[1:2] ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0 ; CHECK-NEXT: v_subb_u32_e64 v1, s[4:5], v5, v2, vcc @@ -1186,155 +1178,152 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: s_subb_u32 s6, 0, 0 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v7, v5 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 -; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v7 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v8, 0 -; GISEL-NEXT: v_mov_b32_e32 v7, v5 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[7:8] -; GISEL-NEXT: v_mul_hi_u32 v12, v9, v4 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v8, v[10:11] -; GISEL-NEXT: v_mul_lo_u32 v10, v9, v4 -; GISEL-NEXT: v_mul_hi_u32 v11, v8, v4 -; GISEL-NEXT: v_mul_lo_u32 v7, v8, v13 -; GISEL-NEXT: v_mul_lo_u32 v4, v9, v13 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v10, v7 +; GISEL-NEXT: v_trunc_f32_e32 v5, v5 +; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 +; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v5 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[5:6] +; GISEL-NEXT: v_mul_hi_u32 v11, v7, v4 +; GISEL-NEXT: v_mul_hi_u32 v12, v8, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[9:10] +; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4 +; GISEL-NEXT: v_mul_lo_u32 v9, v7, v13 +; GISEL-NEXT: v_mul_lo_u32 v4, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7 -; GISEL-NEXT: v_mul_hi_u32 v14, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 +; GISEL-NEXT: v_mul_hi_u32 v14, v7, v13 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 -; GISEL-NEXT: v_add_i32_e32 v18, vcc, v8, v4 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v18, 0 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v9, v7, vcc -; GISEL-NEXT: v_mov_b32_e32 v4, v14 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v7, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v4, v7, v13 -; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], s6, v18, v[14:15] +; GISEL-NEXT: v_mul_hi_u32 v13, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v4, 0 +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc +; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v6, v17, v[14:15] +; GISEL-NEXT: v_mul_lo_u32 v9, v17, v13 +; GISEL-NEXT: v_mul_hi_u32 v18, v4, v13 +; GISEL-NEXT: v_mul_hi_u32 v19, v17, v13 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v4, v[15:16] ; GISEL-NEXT: s_mov_b32 s6, 1 ; GISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GISEL-NEXT: v_mul_lo_u32 v14, v18, v16 +; GISEL-NEXT: v_mul_lo_u32 v14, v4, v13 +; GISEL-NEXT: v_mul_hi_u32 v15, v4, v13 ; GISEL-NEXT: s_subb_u32 s6, 0, 0 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14 -; GISEL-NEXT: v_mul_hi_u32 v14, v18, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 v13, v7, v13 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v14, v7, v16 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v18, v16 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v4 -; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; GISEL-NEXT: v_xor_b32_e32 v17, v0, v4 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v15 -; GISEL-NEXT: v_mul_hi_u32 v13, v7, v16 -; GISEL-NEXT: v_xor_b32_e32 v19, v1, v4 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v9 +; GISEL-NEXT: v_mul_lo_u32 v9, v17, v13 +; GISEL-NEXT: v_mul_hi_u32 v13, v17, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v19 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v9, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v9 +; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc +; GISEL-NEXT: v_xor_b32_e32 v18, v0, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v14 +; GISEL-NEXT: v_xor_b32_e32 v19, v1, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v14, v1 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v16, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v18, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc ; GISEL-NEXT: v_mul_lo_u32 v13, v19, v0 -; GISEL-NEXT: v_mul_lo_u32 v14, v17, v1 -; GISEL-NEXT: v_mul_hi_u32 v15, v17, v0 +; GISEL-NEXT: v_mul_lo_u32 v14, v18, v1 +; GISEL-NEXT: v_mul_hi_u32 v15, v18, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v19, v0 -; GISEL-NEXT: v_mov_b32_e32 v7, 0x12d8fb +; GISEL-NEXT: v_mov_b32_e32 v4, 0x12d8fb ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_mul_lo_u32 v15, v19, v1 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_mul_hi_u32 v14, v17, v1 +; GISEL-NEXT: v_mul_hi_u32 v14, v18, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_add_i32_e32 v18, vcc, v0, v13 -; GISEL-NEXT: v_mul_hi_u32 v15, v19, v1 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v18, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_add_i32_e32 v20, vcc, v15, v13 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v7, v20, v[1:2] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v17, v0 -; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], 0, v18, v[13:14] +; GISEL-NEXT: v_mul_hi_u32 v1, v19, v1 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v0, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 +; GISEL-NEXT: v_add_i32_e32 v20, vcc, v1, v0 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v17, 0 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v20, v[1:2] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v18, v0 +; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], 0, v17, v[13:14] ; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v19, v15, vcc ; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], v19, v15 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v7 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 ; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v13, vcc -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v18 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v20, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v7 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v17 +; GISEL-NEXT: v_addc_u32_e32 v18, vcc, 0, v20, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v19, -1, v0, vcc -; GISEL-NEXT: v_mov_b32_e32 v0, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v15, -1, v14, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v9, v[0:1] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, v[13:14] -; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v16 -; GISEL-NEXT: v_mul_lo_u32 v13, v8, v0 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v17, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v19, -1, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[5:6] +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[0:1] +; GISEL-NEXT: v_add_i32_e32 v0, vcc, 1, v16 +; GISEL-NEXT: v_mul_lo_u32 v5, v7, v13 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v18, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; GISEL-NEXT: v_cndmask_b32_e32 v14, v16, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v13 +; GISEL-NEXT: v_cndmask_b32_e32 v14, v16, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v16, v18, v1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v5, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GISEL-NEXT: v_mul_hi_u32 v1, v7, v13 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; GISEL-NEXT: v_mul_hi_u32 v10, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v9, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_mul_hi_u32 v10, v8, v0 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v8, v1 -; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v9, v0, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v7, v0 +; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v8, v1, vcc ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 -; GISEL-NEXT: v_cndmask_b32_e32 v13, v20, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v11, v[1:2] -; GISEL-NEXT: v_cndmask_b32_e32 v12, v18, v14, vcc -; GISEL-NEXT: v_xor_b32_e32 v1, v12, v4 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, v[8:9] -; GISEL-NEXT: v_ashrrev_i32_e32 v12, 31, v3 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v12 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc -; GISEL-NEXT: v_xor_b32_e32 v8, v2, v12 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v14, vcc +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v11, v[1:2] +; GISEL-NEXT: v_xor_b32_e32 v1, v5, v9 +; GISEL-NEXT: v_ashrrev_i32_e32 v13, 31, v3 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, v[7:8] +; GISEL-NEXT: v_cndmask_b32_e32 v12, v20, v16, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v13 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v13, vcc +; GISEL-NEXT: v_xor_b32_e32 v14, v2, v13 ; GISEL-NEXT: v_mul_lo_u32 v2, v11, v0 ; GISEL-NEXT: v_mul_lo_u32 v6, v10, v5 -; GISEL-NEXT: v_xor_b32_e32 v9, v3, v12 +; GISEL-NEXT: v_xor_b32_e32 v15, v3, v13 ; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 @@ -1356,62 +1345,61 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 ; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v9, v0 -; GISEL-NEXT: v_mul_lo_u32 v5, v8, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v8, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0 -; GISEL-NEXT: v_xor_b32_e32 v10, v13, v4 +; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0 +; GISEL-NEXT: v_mul_lo_u32 v5, v14, v2 +; GISEL-NEXT: v_mul_hi_u32 v6, v14, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 +; GISEL-NEXT: v_xor_b32_e32 v7, v12, v9 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v6, v9, v2 +; GISEL-NEXT: v_mul_lo_u32 v6, v15, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GISEL-NEXT: v_mul_hi_u32 v5, v8, v2 +; GISEL-NEXT: v_mul_hi_u32 v5, v14, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v6, v9, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v11, 0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v6, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v13, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v10, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], 0, v11, v[5:6] -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v8, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v9, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v9, v3 -; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v7 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 -; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v4, -1, v5, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v11 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v13, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v7 +; GISEL-NEXT: v_mul_hi_u32 v5, v15, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v10, 0 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v5, v0 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v11, v[3:4] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v9 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v7, v9, vcc +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], 0, v10, v[5:6] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v14, v2 +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v15, v7 +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v15, v7, vcc +; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v3 +; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v3, -1, v6, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v10 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v11, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 ; GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v5 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v6, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v6 +; GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v12 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v12 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v6, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v4, v7, v5, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v11, v4, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v13 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v13 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v13 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v13, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_sdiv_v2i64_oddk_denom: @@ -1424,178 +1412,175 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; CGP-NEXT: v_trunc_f32_e32 v8, v5 -; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8 +; CGP-NEXT: v_trunc_f32_e32 v5, v5 +; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 +; CGP-NEXT: v_cvt_u32_f32_e32 v8, v5 ; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 -; CGP-NEXT: v_mov_b32_e32 v9, v5 -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v8, v[9:10] -; CGP-NEXT: v_mul_hi_u32 v12, v8, v4 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v7, v[10:11] -; CGP-NEXT: v_mul_lo_u32 v10, v8, v4 -; CGP-NEXT: v_mul_hi_u32 v11, v7, v4 -; CGP-NEXT: v_mul_lo_u32 v4, v7, v13 -; CGP-NEXT: v_mul_lo_u32 v9, v8, v13 -; CGP-NEXT: v_mul_hi_u32 v14, v7, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v8, v13 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[5:6] +; CGP-NEXT: v_mul_hi_u32 v11, v8, v4 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], -1, v7, v[9:10] +; CGP-NEXT: v_mul_lo_u32 v9, v8, v4 +; CGP-NEXT: v_mul_hi_u32 v10, v7, v4 +; CGP-NEXT: v_mul_lo_u32 v4, v7, v12 +; CGP-NEXT: v_mul_lo_u32 v13, v8, v12 +; CGP-NEXT: v_mul_hi_u32 v14, v7, v12 +; CGP-NEXT: v_mul_hi_u32 v12, v8, v12 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v11 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v14 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v14, v9 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 -; CGP-NEXT: v_add_i32_e32 v18, vcc, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v18, 0 -; CGP-NEXT: v_addc_u32_e32 v19, vcc, v8, v9, vcc -; CGP-NEXT: v_mov_b32_e32 v4, v14 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v19, v[4:5] -; CGP-NEXT: v_mul_lo_u32 v4, v19, v13 -; CGP-NEXT: v_mad_u64_u32 v[16:17], s[4:5], -1, v18, v[14:15] -; CGP-NEXT: v_mul_lo_u32 v9, v18, v16 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v18, v13 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; CGP-NEXT: v_addc_u32_e32 v16, vcc, v8, v12, vcc +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v6, v4, 0 +; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v16, v[13:14] +; CGP-NEXT: v_mul_lo_u32 v17, v16, v12 +; CGP-NEXT: v_mul_hi_u32 v18, v4, v12 +; CGP-NEXT: v_mul_hi_u32 v19, v16, v12 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], -1, v4, v[14:15] +; CGP-NEXT: v_mul_lo_u32 v13, v4, v12 +; CGP-NEXT: v_mul_hi_u32 v15, v4, v12 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v17, v13 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_hi_u32 v9, v19, v13 -; CGP-NEXT: v_mul_lo_u32 v13, v19, v16 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4 -; CGP-NEXT: v_mul_hi_u32 v14, v18, v16 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v18 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v9, v14 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v9 -; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc -; CGP-NEXT: v_xor_b32_e32 v15, v0, v9 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v19, v16 -; CGP-NEXT: v_xor_b32_e32 v17, v1, v9 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; CGP-NEXT: v_mul_lo_u32 v14, v16, v12 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v19 +; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v15 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v15, vcc, v17, v15 +; CGP-NEXT: v_mul_hi_u32 v17, v16, v12 +; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v12 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v12, vcc +; CGP-NEXT: v_xor_b32_e32 v18, v0, v12 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v13 +; CGP-NEXT: v_xor_b32_e32 v19, v1, v12 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v18, v0 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v17, v0 -; CGP-NEXT: v_mul_lo_u32 v14, v15, v1 -; CGP-NEXT: v_mul_hi_u32 v16, v15, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v17, v0 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v15, v1 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v17, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v16, v1, vcc +; CGP-NEXT: v_mul_lo_u32 v13, v19, v0 +; CGP-NEXT: v_mul_lo_u32 v14, v18, v1 +; CGP-NEXT: v_mul_hi_u32 v15, v18, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v19, v0 ; CGP-NEXT: v_mov_b32_e32 v4, 0x12d8fb ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v16, v17, v1 +; CGP-NEXT: v_mul_lo_u32 v15, v19, v1 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_mul_hi_u32 v14, v15, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v14, v18, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v0 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v0, v13 -; CGP-NEXT: v_mul_hi_u32 v18, v17, v1 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v16, 0 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v18, vcc, v18, v13 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v18, v[1:2] -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v15, v0 -; CGP-NEXT: v_subb_u32_e64 v1, s[4:5], v17, v13, vcc -; CGP-NEXT: v_sub_i32_e64 v13, s[4:5], v17, v13 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; CGP-NEXT: v_mul_hi_u32 v1, v19, v1 +; CGP-NEXT: v_add_i32_e32 v15, vcc, v0, v13 +; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v0 +; CGP-NEXT: v_add_i32_e32 v16, vcc, v1, v0 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v15, 0 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v16, v[1:2] +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v18, v0 +; CGP-NEXT: v_subb_u32_e64 v1, s[4:5], v19, v13, vcc +; CGP-NEXT: v_sub_i32_e64 v13, s[4:5], v19, v13 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v13, vcc ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CGP-NEXT: v_add_i32_e32 v17, vcc, 1, v16 -; CGP-NEXT: v_addc_u32_e32 v19, vcc, 0, v18, vcc +; CGP-NEXT: v_add_i32_e32 v18, vcc, 1, v15 +; CGP-NEXT: v_addc_u32_e32 v19, vcc, 0, v16, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 -; CGP-NEXT: v_mov_b32_e32 v0, v5 -; CGP-NEXT: v_cndmask_b32_e64 v15, -1, v14, s[4:5] -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v8, v[0:1] -; CGP-NEXT: v_cndmask_b32_e64 v20, 0, -1, vcc +; CGP-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[13:14] -; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v20, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, 1, v17 -; CGP-NEXT: v_addc_u32_e32 v13, vcc, 0, v19, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CGP-NEXT: v_mul_lo_u32 v5, v7, v0 -; CGP-NEXT: v_cndmask_b32_e32 v14, v17, v1, vcc -; CGP-NEXT: v_cndmask_b32_e32 v13, v19, v13, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v5 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11 -; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v10, v8, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1 -; CGP-NEXT: v_mul_hi_u32 v5, v7, v0 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v7, v1 -; CGP-NEXT: v_addc_u32_e32 v11, vcc, v8, v0, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 -; CGP-NEXT: v_cndmask_b32_e32 v5, v16, v14, vcc -; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v11, v[1:2] -; CGP-NEXT: v_cndmask_b32_e32 v12, v18, v13, vcc -; CGP-NEXT: v_xor_b32_e32 v13, v5, v9 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v10, v[7:8] -; CGP-NEXT: v_xor_b32_e32 v1, v12, v9 -; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc -; CGP-NEXT: v_xor_b32_e32 v7, v2, v12 -; CGP-NEXT: v_mul_lo_u32 v2, v11, v0 -; CGP-NEXT: v_mul_lo_u32 v6, v10, v5 -; CGP-NEXT: v_xor_b32_e32 v8, v3, v12 -; CGP-NEXT: v_mul_hi_u32 v3, v10, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v11, v0 +; CGP-NEXT: v_cndmask_b32_e64 v17, -1, v14, s[4:5] +; CGP-NEXT: v_cndmask_b32_e32 v20, -1, v0, vcc +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[5:6] +; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v18 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v7, v[0:1] +; CGP-NEXT: v_addc_u32_e32 v21, vcc, 0, v19, vcc +; CGP-NEXT: v_mul_lo_u32 v1, v7, v13 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; CGP-NEXT: v_cndmask_b32_e32 v0, v18, v5, vcc +; CGP-NEXT: v_cndmask_b32_e32 v5, v19, v21, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; CGP-NEXT: v_cndmask_b32_e32 v14, v15, v0, vcc +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v9, v1 +; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v0, v10 +; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; CGP-NEXT: v_mul_lo_u32 v9, v8, v13 +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v1, v0 +; CGP-NEXT: v_mul_hi_u32 v1, v7, v13 +; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v11 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v9, v1 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v10, v9 +; CGP-NEXT: v_mul_hi_u32 v10, v8, v13 +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v1, v0 +; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v9, v1 +; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v10, v1 +; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v7, v0 +; CGP-NEXT: v_addc_u32_e64 v10, s[4:5], v8, v1, s[4:5] +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, 0 +; CGP-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc +; CGP-NEXT: v_ashrrev_i32_e32 v13, 31, v3 +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v10, v[1:2] +; CGP-NEXT: v_xor_b32_e32 v1, v5, v12 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v13 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v9, v[7:8] +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v13, vcc +; CGP-NEXT: v_xor_b32_e32 v7, v2, v13 +; CGP-NEXT: v_mul_lo_u32 v2, v10, v0 +; CGP-NEXT: v_mul_lo_u32 v6, v9, v5 +; CGP-NEXT: v_xor_b32_e32 v8, v3, v13 +; CGP-NEXT: v_mul_hi_u32 v3, v9, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v10, v0 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v3, v11, v5 +; CGP-NEXT: v_mul_lo_u32 v3, v10, v5 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; CGP-NEXT: v_mul_hi_u32 v6, v10, v5 +; CGP-NEXT: v_mul_hi_u32 v6, v9, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CGP-NEXT: v_mul_hi_u32 v5, v11, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v10, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v0 -; CGP-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v0 +; CGP-NEXT: v_addc_u32_e32 v2, vcc, v10, v2, vcc ; CGP-NEXT: v_mul_lo_u32 v5, v8, v3 ; CGP-NEXT: v_mul_lo_u32 v6, v7, v2 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v13, v9 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc +; CGP-NEXT: v_xor_b32_e32 v11, v14, v12 ; CGP-NEXT: v_mul_hi_u32 v9, v7, v3 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v12 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v12, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 @@ -1610,11 +1595,11 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v3, v5 -; CGP-NEXT: v_mul_hi_u32 v10, v8, v2 +; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v3 +; CGP-NEXT: v_mul_hi_u32 v6, v8, v2 ; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v9, 0 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v5 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v6, v5 ; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v10, v[3:4] ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v7, v2 ; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v8, v5, vcc @@ -1640,10 +1625,10 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; CGP-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc ; CGP-NEXT: v_cndmask_b32_e32 v3, v10, v4, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v12 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v12 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v2, v13 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v13 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v13 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v13, vcc ; CGP-NEXT: s_setpc_b64 s[30:31] %result = sdiv <2 x i64> %num, ret <2 x i64> %result @@ -1679,28 +1664,28 @@ define i64 @v_sdiv_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: v_xor_b32_e32 v1, v5, v0 ; CHECK-NEXT: v_cvt_f32_u32_e32 v5, v2 ; CHECK-NEXT: v_cvt_f32_u32_e32 v6, v1 -; CHECK-NEXT: v_sub_i32_e32 v12, vcc, 0, v2 -; CHECK-NEXT: v_subb_u32_e32 v13, vcc, 0, v1, vcc +; CHECK-NEXT: v_sub_i32_e32 v13, vcc, 0, v2 +; CHECK-NEXT: v_subb_u32_e32 v14, vcc, 0, v1, vcc ; CHECK-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v5, v5 ; CHECK-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 ; CHECK-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 -; CHECK-NEXT: v_trunc_f32_e32 v7, v6 -; CHECK-NEXT: v_mac_f32_e32 v5, 0xcf800000, v7 -; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v5 -; CHECK-NEXT: v_cvt_u32_f32_e32 v14, v7 -; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v11, 0 -; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v14, v[6:7] -; CHECK-NEXT: v_mul_lo_u32 v6, v14, v5 -; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[7:8] -; CHECK-NEXT: v_mul_hi_u32 v7, v11, v5 -; CHECK-NEXT: v_mul_hi_u32 v5, v14, v5 -; CHECK-NEXT: v_mul_lo_u32 v8, v11, v9 -; CHECK-NEXT: v_mul_lo_u32 v10, v14, v9 +; CHECK-NEXT: v_trunc_f32_e32 v6, v6 +; CHECK-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6 +; CHECK-NEXT: v_cvt_u32_f32_e32 v12, v5 +; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v6 +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0 +; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v11, v[6:7] +; CHECK-NEXT: v_mul_lo_u32 v6, v11, v5 +; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v12, v[7:8] +; CHECK-NEXT: v_mul_hi_u32 v7, v12, v5 +; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5 +; CHECK-NEXT: v_mul_lo_u32 v8, v12, v9 +; CHECK-NEXT: v_mul_lo_u32 v10, v11, v9 ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; CHECK-NEXT: v_mul_hi_u32 v7, v11, v9 +; CHECK-NEXT: v_mul_hi_u32 v7, v12, v9 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v10, v5 @@ -1708,53 +1693,53 @@ define i64 @v_sdiv_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CHECK-NEXT: v_mul_hi_u32 v8, v14, v9 +; CHECK-NEXT: v_mul_hi_u32 v8, v11, v9 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CHECK-NEXT: v_add_i32_e32 v11, vcc, v11, v5 -; CHECK-NEXT: v_addc_u32_e32 v14, vcc, v14, v6, vcc -; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v11, 0 -; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v14, v[6:7] -; CHECK-NEXT: v_ashrrev_i32_e32 v12, 31, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v12 -; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[7:8] -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v12, vcc -; CHECK-NEXT: v_xor_b32_e32 v10, v3, v12 -; CHECK-NEXT: v_mul_lo_u32 v3, v14, v5 -; CHECK-NEXT: v_mul_lo_u32 v6, v11, v9 -; CHECK-NEXT: v_xor_b32_e32 v13, v4, v12 -; CHECK-NEXT: v_mul_hi_u32 v4, v11, v5 -; CHECK-NEXT: v_mul_hi_u32 v5, v14, v5 +; CHECK-NEXT: v_add_i32_e32 v12, vcc, v12, v5 +; CHECK-NEXT: v_addc_u32_e32 v11, vcc, v11, v6, vcc +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0 +; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v11, v[6:7] +; CHECK-NEXT: v_ashrrev_i32_e32 v13, 31, v4 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v13 +; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v12, v[7:8] +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v13, vcc +; CHECK-NEXT: v_xor_b32_e32 v10, v3, v13 +; CHECK-NEXT: v_mul_lo_u32 v3, v11, v5 +; CHECK-NEXT: v_mul_lo_u32 v6, v12, v9 +; CHECK-NEXT: v_xor_b32_e32 v14, v4, v13 +; CHECK-NEXT: v_mul_hi_u32 v4, v12, v5 +; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v4, v14, v9 +; CHECK-NEXT: v_mul_lo_u32 v4, v11, v9 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3 -; CHECK-NEXT: v_mul_hi_u32 v6, v11, v9 +; CHECK-NEXT: v_mul_hi_u32 v6, v12, v9 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CHECK-NEXT: v_mul_hi_u32 v6, v14, v9 +; CHECK-NEXT: v_mul_hi_u32 v6, v11, v9 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v11, v3 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v14, v4, vcc -; CHECK-NEXT: v_mul_lo_u32 v5, v13, v3 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v12, v3 +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v11, v4, vcc +; CHECK-NEXT: v_mul_lo_u32 v5, v14, v3 ; CHECK-NEXT: v_mul_lo_u32 v6, v10, v4 ; CHECK-NEXT: v_mul_hi_u32 v7, v10, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v13, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v14, v3 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, v13, v4 +; CHECK-NEXT: v_mul_lo_u32 v7, v14, v4 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; CHECK-NEXT: v_mul_hi_u32 v6, v10, v4 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 @@ -1763,16 +1748,16 @@ define i64 @v_sdiv_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; CHECK-NEXT: v_add_i32_e32 v9, vcc, v3, v5 -; CHECK-NEXT: v_mul_hi_u32 v7, v13, v4 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v3 +; CHECK-NEXT: v_mul_hi_u32 v6, v14, v4 ; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v9, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_add_i32_e32 v11, vcc, v7, v5 +; CHECK-NEXT: v_add_i32_e32 v11, vcc, v6, v5 ; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v11, v[4:5] ; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v10, v3 ; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v9, v[5:6] -; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v13, v7, vcc -; CHECK-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v7 +; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v14, v7, vcc +; CHECK-NEXT: v_sub_i32_e64 v5, s[4:5], v14, v7 ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1 ; CHECK-NEXT: v_subb_u32_e32 v5, vcc, v5, v1, vcc ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] @@ -1797,7 +1782,7 @@ define i64 @v_sdiv_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; CHECK-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v3, v12, v0 +; CHECK-NEXT: v_xor_b32_e32 v3, v13, v0 ; CHECK-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc ; CHECK-NEXT: v_xor_b32_e32 v0, v1, v3 ; CHECK-NEXT: v_xor_b32_e32 v1, v2, v3 @@ -1839,274 +1824,268 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-LABEL: v_sdiv_v2i64_pow2_shl_denom: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v9, 0x1000 -; GISEL-NEXT: v_mov_b32_e32 v10, 0 -; GISEL-NEXT: v_lshl_b64 v[7:8], v[9:10], v4 -; GISEL-NEXT: v_lshl_b64 v[9:10], v[9:10], v6 +; GISEL-NEXT: v_mov_b32_e32 v12, 0x1000 +; GISEL-NEXT: v_mov_b32_e32 v13, 0 +; GISEL-NEXT: v_lshl_b64 v[7:8], v[12:13], v4 ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v8 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v4 ; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v8, v4, vcc ; GISEL-NEXT: v_xor_b32_e32 v8, v5, v4 -; GISEL-NEXT: v_xor_b32_e32 v5, v7, v4 -; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v8 -; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v5 -; GISEL-NEXT: v_sub_i32_e32 v17, vcc, 0, v8 -; GISEL-NEXT: v_subb_u32_e32 v18, vcc, 0, v5, vcc -; GISEL-NEXT: v_mac_f32_e32 v7, 0x4f800000, v11 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v7 -; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v10 -; GISEL-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7 -; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v7 -; GISEL-NEXT: v_trunc_f32_e32 v13, v11 -; GISEL-NEXT: v_mac_f32_e32 v7, 0xcf800000, v13 -; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v7 -; GISEL-NEXT: v_cvt_u32_f32_e32 v19, v13 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v16, 0 -; GISEL-NEXT: v_mov_b32_e32 v7, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v19, v[7:8] -; GISEL-NEXT: v_mul_lo_u32 v7, v19, v11 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v16, v[12:13] -; GISEL-NEXT: v_mul_lo_u32 v12, v16, v14 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v16, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 v11, v19, v11 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, v19, v14 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 -; GISEL-NEXT: v_mul_hi_u32 v13, v16, v14 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v19, v14 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v11, v7 +; GISEL-NEXT: v_xor_b32_e32 v7, v7, v4 +; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v8 +; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v7 +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v8 +; GISEL-NEXT: v_subb_u32_e32 v18, vcc, 0, v7, vcc +; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v9 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5 +; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 +; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v5 +; GISEL-NEXT: v_trunc_f32_e32 v9, v9 +; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v9 +; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v9 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v5, 0 +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v16, v11, v[10:11] +; GISEL-NEXT: v_mul_lo_u32 v17, v11, v9 +; GISEL-NEXT: v_mul_hi_u32 v19, v5, v9 +; GISEL-NEXT: v_mul_hi_u32 v20, v11, v9 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v18, v5, v[14:15] +; GISEL-NEXT: v_mul_lo_u32 v10, v5, v9 +; GISEL-NEXT: v_mul_lo_u32 v15, v11, v9 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v17, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v19 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10 +; GISEL-NEXT: v_mul_hi_u32 v14, v5, v9 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v20 +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v15 +; GISEL-NEXT: v_mul_hi_u32 v9, v11, v9 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v14 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 +; GISEL-NEXT: v_addc_u32_e32 v19, vcc, v11, v9, vcc +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v5, 0 +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v16, v19, v[11:12] +; GISEL-NEXT: v_mul_lo_u32 v9, v19, v10 +; GISEL-NEXT: v_lshl_b64 v[12:13], v[12:13], v6 +; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v18, v5, v[14:15] +; GISEL-NEXT: v_mul_hi_u32 v14, v5, v10 +; GISEL-NEXT: v_mul_lo_u32 v11, v5, v16 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v7 -; GISEL-NEXT: v_addc_u32_e32 v19, vcc, v19, v11, vcc -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v16, 0 -; GISEL-NEXT: v_mov_b32_e32 v7, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v19, v[7:8] -; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v16, v[12:13] -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc -; GISEL-NEXT: v_xor_b32_e32 v17, v0, v7 -; GISEL-NEXT: v_mul_lo_u32 v0, v19, v11 -; GISEL-NEXT: v_mul_lo_u32 v12, v16, v14 -; GISEL-NEXT: v_xor_b32_e32 v18, v1, v7 -; GISEL-NEXT: v_mul_hi_u32 v1, v16, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v19, v11 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v14 +; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1 +; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v0, v9 +; GISEL-NEXT: v_addc_u32_e64 v1, s[4:5], v1, v9, s[4:5] +; GISEL-NEXT: v_xor_b32_e32 v14, v0, v9 +; GISEL-NEXT: v_xor_b32_e32 v15, v1, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v19, v14 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 -; GISEL-NEXT: v_mul_hi_u32 v12, v16, v14 +; GISEL-NEXT: v_mul_hi_u32 v1, v19, v10 +; GISEL-NEXT: v_mul_lo_u32 v10, v19, v16 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 +; GISEL-NEXT: v_mul_hi_u32 v11, v5, v16 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v19, v14 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v19, v16 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v18, v0 -; GISEL-NEXT: v_mul_lo_u32 v12, v17, v1 -; GISEL-NEXT: v_mul_hi_u32 v13, v17, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 -; GISEL-NEXT: v_xor_b32_e32 v7, v7, v4 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_mul_lo_u32 v5, v15, v0 +; GISEL-NEXT: v_mul_lo_u32 v10, v14, v1 +; GISEL-NEXT: v_mul_hi_u32 v11, v14, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 +; GISEL-NEXT: v_xor_b32_e32 v4, v9, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v11, v15, v1 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; GISEL-NEXT: v_mul_hi_u32 v10, v14, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v18, v1 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_mul_hi_u32 v12, v17, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v12 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 -; GISEL-NEXT: v_mul_hi_u32 v1, v18, v1 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v8, v0, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v8, v16, v[1:2] -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v6 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v10, v6, vcc -; GISEL-NEXT: v_xor_b32_e32 v10, v1, v6 -; GISEL-NEXT: v_xor_b32_e32 v9, v9, v6 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v5, v0, v[12:13] -; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v10 -; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v9 -; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v17, v11 -; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], v18, v14 -; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v12 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v18, v14, vcc -; GISEL-NEXT: v_subb_u32_e32 v14, vcc, v11, v5, vcc -; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 -; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v1 -; GISEL-NEXT: v_trunc_f32_e32 v15, v11 -; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v15 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v0, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v0 +; GISEL-NEXT: v_mul_hi_u32 v10, v15, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v16, 0 +; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v3 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v10, v5 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v8, v17, v[1:2] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v14, v0 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v16, v[10:11] +; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v15, v5, vcc +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v15, v5 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v7 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v5, v7, vcc +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v16 +; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v17, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v7 +; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v11, v6, v10, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, -1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v12, v5 +; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v13, v5, vcc +; GISEL-NEXT: v_xor_b32_e32 v10, v6, v5 +; GISEL-NEXT: v_xor_b32_e32 v6, v12, v5 +; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v10 +; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v6 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v7 +; GISEL-NEXT: v_mac_f32_e32 v12, 0x4f800000, v13 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v12 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v18, v0, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v14 +; GISEL-NEXT: v_addc_u32_e32 v12, vcc, 0, v15, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1 +; GISEL-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GISEL-NEXT: v_trunc_f32_e32 v1, v1 +; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; GISEL-NEXT: v_cndmask_b32_e32 v13, v14, v7, vcc +; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v0 +; GISEL-NEXT: v_sub_i32_e64 v19, s[4:5], 0, v10 ; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v1 -; GISEL-NEXT: v_sub_i32_e32 v19, vcc, 0, v10 -; GISEL-NEXT: v_subb_u32_e32 v20, vcc, 0, v9, vcc -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[6:7], v19, v18, 0 -; GISEL-NEXT: v_cvt_u32_f32_e32 v22, v15 -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v13, v8 -; GISEL-NEXT: v_subbrev_u32_e32 v21, vcc, 0, v14, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v8 -; GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v8 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v19, v22, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v1, v22, v11 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v21, v5 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v20, v18, v[12:13] -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v17, v5 -; GISEL-NEXT: v_mul_lo_u32 v8, v18, v14 -; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v18, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[6:7] -; GISEL-NEXT: v_mul_hi_u32 v11, v22, v11 -; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v21, v5 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v17, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v5, v13, v15, s[4:5] -; GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v16, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, 1, v8 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v13, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v15, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v8, v13, v17, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 -; GISEL-NEXT: v_mul_lo_u32 v13, v22, v14 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; GISEL-NEXT: v_mul_hi_u32 v13, v18, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 v14, v22, v14 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v12 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v11 -; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v22, v12, vcc -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v19, v13, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v19, v14, v[0:1] -; GISEL-NEXT: v_xor_b32_e32 v12, v1, v7 -; GISEL-NEXT: v_ashrrev_i32_e32 v15, 31, v3 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v13, v[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v15 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v3, v15, vcc -; GISEL-NEXT: v_xor_b32_e32 v5, v1, v15 -; GISEL-NEXT: v_mul_lo_u32 v1, v14, v11 -; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0 -; GISEL-NEXT: v_xor_b32_e32 v16, v2, v15 -; GISEL-NEXT: v_mul_hi_u32 v2, v13, v11 -; GISEL-NEXT: v_mul_hi_u32 v4, v14, v11 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_subb_u32_e64 v20, s[4:5], 0, v6, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v14, 0 +; GISEL-NEXT: v_cndmask_b32_e32 v12, v15, v12, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v19, v18, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v21, v18, v0 +; GISEL-NEXT: v_mul_hi_u32 v22, v14, v0 +; GISEL-NEXT: v_mul_hi_u32 v23, v18, v0 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v14, v[7:8] +; GISEL-NEXT: v_cndmask_b32_e32 v11, v16, v13, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v15, v17, v12, vcc +; GISEL-NEXT: v_mul_lo_u32 v1, v14, v0 +; GISEL-NEXT: v_mul_lo_u32 v8, v18, v0 +; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v21, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v22 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v7, v1 +; GISEL-NEXT: v_mul_hi_u32 v7, v14, v0 +; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v23 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v13, v8 +; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 +; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v7, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7 +; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v0, v7 +; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v14, v1 +; GISEL-NEXT: v_addc_u32_e64 v14, s[4:5], v18, v0, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v13, 0 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v19, v14, v[1:2] +; GISEL-NEXT: v_xor_b32_e32 v1, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v9 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v20, v13, v[7:8] +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v9, vcc +; GISEL-NEXT: v_xor_b32_e32 v12, v2, v9 ; GISEL-NEXT: v_mul_lo_u32 v2, v14, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; GISEL-NEXT: v_mul_lo_u32 v7, v13, v11 +; GISEL-NEXT: v_xor_b32_e32 v16, v3, v9 ; GISEL-NEXT: v_mul_hi_u32 v3, v13, v0 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_addc_u32_e32 v0, vcc, v14, v0, vcc -; GISEL-NEXT: v_mul_lo_u32 v2, v16, v1 -; GISEL-NEXT: v_mul_lo_u32 v3, v5, v0 -; GISEL-NEXT: v_mul_hi_u32 v4, v5, v1 -; GISEL-NEXT: v_mul_hi_u32 v1, v16, v1 -; GISEL-NEXT: v_xor_b32_e32 v8, v8, v7 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v14, v11 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; GISEL-NEXT: v_mul_hi_u32 v7, v13, v11 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v14, v11 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v4, v16, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GISEL-NEXT: v_mul_hi_u32 v3, v5, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v3 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v1, v2 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v14, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v16, v0 +; GISEL-NEXT: v_mul_lo_u32 v7, v12, v2 +; GISEL-NEXT: v_mul_hi_u32 v8, v12, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v11, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v1 -; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v13, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v12, v7 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v7, vcc -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v9, v11, v[3:4] -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v5, v2 -; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v16, v7, vcc -; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v16, v7 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v9 -; GISEL-NEXT: v_subb_u32_e32 v4, vcc, v4, v9, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] +; GISEL-NEXT: v_xor_b32_e32 v11, v15, v4 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v8, v16, v2 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; GISEL-NEXT: v_mul_hi_u32 v7, v12, v2 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 +; GISEL-NEXT: v_mul_hi_u32 v7, v16, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v13, 0 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v7, v0 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v14, v[3:4] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v11, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v13, v[7:8] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v16, v3, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v16, v3 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v6 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v10 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v9 -; GISEL-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v3, v5, v7, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v11 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v13, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v4, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v6 +; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v4, v7, v8, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v13 +; GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v14, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v4, v9 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v5 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v7, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v4, vcc -; GISEL-NEXT: v_xor_b32_e32 v4, v15, v6 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v7 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v8, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v8, v6, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v13, v2, vcc +; GISEL-NEXT: v_xor_b32_e32 v4, v9, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v3, v14, v3, vcc ; GISEL-NEXT: v_xor_b32_e32 v2, v2, v4 ; GISEL-NEXT: v_xor_b32_e32 v3, v3, v4 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 @@ -2138,28 +2117,28 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_xor_b32_e32 v1, v10, v0 ; CGP-NEXT: v_cvt_f32_u32_e32 v10, v4 ; CGP-NEXT: v_cvt_f32_u32_e32 v11, v1 -; CGP-NEXT: v_sub_i32_e32 v17, vcc, 0, v4 -; CGP-NEXT: v_subb_u32_e32 v18, vcc, 0, v1, vcc +; CGP-NEXT: v_sub_i32_e32 v18, vcc, 0, v4 +; CGP-NEXT: v_subb_u32_e32 v19, vcc, 0, v1, vcc ; CGP-NEXT: v_mac_f32_e32 v10, 0x4f800000, v11 ; CGP-NEXT: v_rcp_iflag_f32_e32 v10, v10 ; CGP-NEXT: v_mul_f32_e32 v10, 0x5f7ffffc, v10 ; CGP-NEXT: v_mul_f32_e32 v11, 0x2f800000, v10 -; CGP-NEXT: v_trunc_f32_e32 v12, v11 -; CGP-NEXT: v_mac_f32_e32 v10, 0xcf800000, v12 -; CGP-NEXT: v_cvt_u32_f32_e32 v16, v10 -; CGP-NEXT: v_cvt_u32_f32_e32 v19, v12 -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v17, v16, 0 -; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v19, v[11:12] -; CGP-NEXT: v_mul_lo_u32 v11, v19, v10 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v16, v[12:13] -; CGP-NEXT: v_mul_hi_u32 v12, v16, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v19, v10 -; CGP-NEXT: v_mul_lo_u32 v13, v16, v14 -; CGP-NEXT: v_mul_lo_u32 v15, v19, v14 +; CGP-NEXT: v_trunc_f32_e32 v11, v11 +; CGP-NEXT: v_mac_f32_e32 v10, 0xcf800000, v11 +; CGP-NEXT: v_cvt_u32_f32_e32 v17, v10 +; CGP-NEXT: v_cvt_u32_f32_e32 v16, v11 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v18, v17, 0 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v18, v16, v[11:12] +; CGP-NEXT: v_mul_lo_u32 v11, v16, v10 +; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v19, v17, v[12:13] +; CGP-NEXT: v_mul_hi_u32 v12, v17, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v16, v10 +; CGP-NEXT: v_mul_lo_u32 v13, v17, v14 +; CGP-NEXT: v_mul_lo_u32 v15, v16, v14 ; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; CGP-NEXT: v_mul_hi_u32 v12, v16, v14 +; CGP-NEXT: v_mul_hi_u32 v12, v17, v14 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v15, v10 @@ -2167,53 +2146,53 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; CGP-NEXT: v_mul_hi_u32 v13, v19, v14 +; CGP-NEXT: v_mul_hi_u32 v13, v16, v14 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 ; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v10 -; CGP-NEXT: v_addc_u32_e32 v19, vcc, v19, v11, vcc -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v17, v16, 0 -; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v19, v[11:12] -; CGP-NEXT: v_ashrrev_i32_e32 v17, 31, v9 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v17 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v16, v[12:13] -; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v17, vcc -; CGP-NEXT: v_xor_b32_e32 v15, v8, v17 -; CGP-NEXT: v_mul_lo_u32 v8, v19, v10 -; CGP-NEXT: v_mul_lo_u32 v11, v16, v14 -; CGP-NEXT: v_xor_b32_e32 v18, v9, v17 -; CGP-NEXT: v_mul_hi_u32 v9, v16, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v19, v10 +; CGP-NEXT: v_add_i32_e32 v17, vcc, v17, v10 +; CGP-NEXT: v_addc_u32_e32 v16, vcc, v16, v11, vcc +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v18, v17, 0 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v18, v16, v[11:12] +; CGP-NEXT: v_ashrrev_i32_e32 v18, 31, v9 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v18 +; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v19, v17, v[12:13] +; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v18, vcc +; CGP-NEXT: v_xor_b32_e32 v15, v8, v18 +; CGP-NEXT: v_mul_lo_u32 v8, v16, v10 +; CGP-NEXT: v_mul_lo_u32 v11, v17, v14 +; CGP-NEXT: v_xor_b32_e32 v19, v9, v18 +; CGP-NEXT: v_mul_hi_u32 v9, v17, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v16, v10 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v19, v14 +; CGP-NEXT: v_mul_lo_u32 v9, v16, v14 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v8 -; CGP-NEXT: v_mul_hi_u32 v11, v16, v14 +; CGP-NEXT: v_mul_hi_u32 v11, v17, v14 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; CGP-NEXT: v_mul_hi_u32 v11, v19, v14 +; CGP-NEXT: v_mul_hi_u32 v11, v16, v14 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v16, v8 -; CGP-NEXT: v_addc_u32_e32 v9, vcc, v19, v9, vcc -; CGP-NEXT: v_mul_lo_u32 v10, v18, v8 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v17, v8 +; CGP-NEXT: v_addc_u32_e32 v9, vcc, v16, v9, vcc +; CGP-NEXT: v_mul_lo_u32 v10, v19, v8 ; CGP-NEXT: v_mul_lo_u32 v11, v15, v9 ; CGP-NEXT: v_mul_hi_u32 v12, v15, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v18, v8 +; CGP-NEXT: v_mul_hi_u32 v8, v19, v8 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v12, v18, v9 +; CGP-NEXT: v_mul_lo_u32 v12, v19, v9 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; CGP-NEXT: v_mul_hi_u32 v11, v15, v9 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v12, v8 @@ -2222,16 +2201,16 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 ; CGP-NEXT: v_add_i32_e32 v14, vcc, v8, v10 -; CGP-NEXT: v_mul_hi_u32 v12, v18, v9 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v8 +; CGP-NEXT: v_mul_hi_u32 v11, v19, v9 ; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v14, 0 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v12, v10 +; CGP-NEXT: v_add_i32_e32 v16, vcc, v11, v10 ; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v4, v16, v[9:10] ; CGP-NEXT: v_sub_i32_e32 v8, vcc, v15, v8 ; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v1, v14, v[10:11] -; CGP-NEXT: v_subb_u32_e64 v9, s[4:5], v18, v12, vcc -; CGP-NEXT: v_sub_i32_e64 v10, s[4:5], v18, v12 +; CGP-NEXT: v_subb_u32_e64 v9, s[4:5], v19, v12, vcc +; CGP-NEXT: v_sub_i32_e64 v10, s[4:5], v19, v12 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v1 ; CGP-NEXT: v_subb_u32_e32 v10, vcc, v10, v1, vcc ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] @@ -2256,7 +2235,7 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_cndmask_b32_e32 v4, v12, v8, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 ; CGP-NEXT: v_cndmask_b32_e32 v1, v14, v1, vcc -; CGP-NEXT: v_xor_b32_e32 v8, v17, v0 +; CGP-NEXT: v_xor_b32_e32 v8, v18, v0 ; CGP-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc ; CGP-NEXT: v_xor_b32_e32 v0, v1, v8 ; CGP-NEXT: v_xor_b32_e32 v1, v4, v8 @@ -2313,102 +2292,100 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_xor_b32_e32 v3, v6, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v6, v4 ; CGP-NEXT: v_cvt_f32_u32_e32 v8, v3 -; CGP-NEXT: v_sub_i32_e32 v14, vcc, 0, v4 -; CGP-NEXT: v_subb_u32_e32 v15, vcc, 0, v3, vcc +; CGP-NEXT: v_sub_i32_e32 v15, vcc, 0, v4 +; CGP-NEXT: v_subb_u32_e32 v16, vcc, 0, v3, vcc ; CGP-NEXT: v_mac_f32_e32 v6, 0x4f800000, v8 ; CGP-NEXT: v_rcp_iflag_f32_e32 v6, v6 ; CGP-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 ; CGP-NEXT: v_mul_f32_e32 v8, 0x2f800000, v6 -; CGP-NEXT: v_trunc_f32_e32 v10, v8 -; CGP-NEXT: v_mac_f32_e32 v6, 0xcf800000, v10 -; CGP-NEXT: v_cvt_u32_f32_e32 v13, v6 -; CGP-NEXT: v_cvt_u32_f32_e32 v16, v10 -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v13, 0 -; CGP-NEXT: v_mov_b32_e32 v6, v9 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v16, v[6:7] -; CGP-NEXT: v_mul_lo_u32 v6, v16, v8 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[9:10] -; CGP-NEXT: v_mul_hi_u32 v9, v13, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v16, v8 -; CGP-NEXT: v_mul_lo_u32 v10, v13, v11 -; CGP-NEXT: v_mul_lo_u32 v12, v16, v11 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v13, v11 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v10, v6 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v12, v8 +; CGP-NEXT: v_trunc_f32_e32 v8, v8 +; CGP-NEXT: v_mac_f32_e32 v6, 0xcf800000, v8 +; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 +; CGP-NEXT: v_cvt_u32_f32_e32 v14, v8 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v15, v6, 0 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v14, v[9:10] +; CGP-NEXT: v_mul_lo_u32 v9, v14, v8 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v6, v[10:11] +; CGP-NEXT: v_mul_hi_u32 v10, v6, v8 +; CGP-NEXT: v_mul_hi_u32 v8, v14, v8 +; CGP-NEXT: v_mul_lo_u32 v11, v6, v12 +; CGP-NEXT: v_mul_lo_u32 v13, v14, v12 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v6, v12 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v13, v8 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_mul_hi_u32 v11, v14, v12 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; CGP-NEXT: v_mul_hi_u32 v10, v16, v11 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v6 -; CGP-NEXT: v_addc_u32_e32 v16, vcc, v16, v8, vcc -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v13, 0 -; CGP-NEXT: v_mov_b32_e32 v6, v9 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v16, v[6:7] -; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v7 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[9:10] -; CGP-NEXT: v_addc_u32_e32 v6, vcc, v7, v14, vcc -; CGP-NEXT: v_xor_b32_e32 v12, v5, v14 -; CGP-NEXT: v_mul_lo_u32 v5, v16, v8 -; CGP-NEXT: v_mul_lo_u32 v7, v13, v11 -; CGP-NEXT: v_xor_b32_e32 v15, v6, v14 -; CGP-NEXT: v_mul_hi_u32 v6, v13, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v16, v8 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; CGP-NEXT: v_addc_u32_e32 v14, vcc, v14, v9, vcc +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v15, v6, 0 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v14, v[9:10] +; CGP-NEXT: v_ashrrev_i32_e32 v15, 31, v7 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v15 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v6, v[10:11] +; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v15, vcc +; CGP-NEXT: v_xor_b32_e32 v11, v5, v15 +; CGP-NEXT: v_mul_lo_u32 v5, v14, v8 +; CGP-NEXT: v_mul_lo_u32 v9, v6, v12 +; CGP-NEXT: v_xor_b32_e32 v13, v7, v15 +; CGP-NEXT: v_mul_hi_u32 v7, v6, v8 +; CGP-NEXT: v_mul_hi_u32 v8, v14, v8 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v6, v16, v11 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CGP-NEXT: v_mul_hi_u32 v7, v13, v11 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; CGP-NEXT: v_mul_lo_u32 v7, v14, v12 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 +; CGP-NEXT: v_mul_hi_u32 v9, v6, v12 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; CGP-NEXT: v_mul_hi_u32 v9, v14, v12 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CGP-NEXT: v_mul_hi_u32 v8, v16, v11 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v13, v5 -; CGP-NEXT: v_addc_u32_e32 v6, vcc, v16, v6, vcc -; CGP-NEXT: v_mul_lo_u32 v7, v15, v5 -; CGP-NEXT: v_mul_lo_u32 v8, v12, v6 -; CGP-NEXT: v_mul_hi_u32 v9, v12, v5 -; CGP-NEXT: v_mul_hi_u32 v5, v15, v5 +; CGP-NEXT: v_addc_u32_e32 v6, vcc, v14, v7, vcc +; CGP-NEXT: v_mul_lo_u32 v7, v13, v5 +; CGP-NEXT: v_mul_lo_u32 v8, v11, v6 +; CGP-NEXT: v_mul_hi_u32 v9, v11, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v13, v5 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v15, v6 +; CGP-NEXT: v_mul_lo_u32 v9, v13, v6 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CGP-NEXT: v_mul_hi_u32 v8, v12, v6 +; CGP-NEXT: v_mul_hi_u32 v8, v11, v6 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v5, v7 -; CGP-NEXT: v_mul_hi_u32 v9, v15, v6 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v11, 0 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v9, v7 -; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v13, v[6:7] -; CGP-NEXT: v_sub_i32_e32 v5, vcc, v12, v5 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v3, v11, v[7:8] -; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v15, v9, vcc -; CGP-NEXT: v_sub_i32_e64 v7, s[4:5], v15, v9 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v5, v7 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v5 +; CGP-NEXT: v_mul_hi_u32 v8, v13, v6 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v12, 0 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v8, v7 +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v14, v[6:7] +; CGP-NEXT: v_sub_i32_e32 v5, vcc, v11, v5 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v3, v12, v[7:8] +; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v13, v9, vcc +; CGP-NEXT: v_sub_i32_e64 v7, s[4:5], v13, v9 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3 ; CGP-NEXT: v_subb_u32_e32 v7, vcc, v7, v3, vcc ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] @@ -2418,8 +2395,8 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3 ; CGP-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v7, vcc ; CGP-NEXT: v_cndmask_b32_e64 v6, v8, v9, s[4:5] -; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v11 -; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v13, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v12 +; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v14, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v7, v3 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v4 @@ -2432,9 +2409,9 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc ; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v5, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; CGP-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc -; CGP-NEXT: v_xor_b32_e32 v5, v14, v2 -; CGP-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc +; CGP-NEXT: v_xor_b32_e32 v5, v15, v2 +; CGP-NEXT: v_cndmask_b32_e32 v4, v14, v4, vcc ; CGP-NEXT: v_xor_b32_e32 v2, v3, v5 ; CGP-NEXT: v_xor_b32_e32 v3, v4, v5 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 @@ -2538,28 +2515,29 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4 ; GISEL-NEXT: v_cvt_f32_u32_e32 v3, v1 ; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v10, 0 -; GISEL-NEXT: v_sub_i32_e32 v11, vcc, 0, v1 +; GISEL-NEXT: v_sub_i32_e32 v12, vcc, 0, v1 ; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v10 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_subb_u32_e64 v13, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GISEL-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 -; GISEL-NEXT: v_trunc_f32_e32 v5, v4 -; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5 -; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v3 -; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v5 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v9, 0 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v13, v[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v9, v[7:8] -; GISEL-NEXT: v_mul_lo_u32 v5, v13, v3 -; GISEL-NEXT: v_mul_hi_u32 v7, v9, v3 -; GISEL-NEXT: v_mul_lo_u32 v8, v9, v4 -; GISEL-NEXT: v_mul_hi_u32 v3, v13, v3 -; GISEL-NEXT: v_mul_lo_u32 v14, v13, v4 +; GISEL-NEXT: v_trunc_f32_e32 v4, v4 +; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v3 +; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v4 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v11, 0 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v9, v[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v11, v[7:8] +; GISEL-NEXT: v_mul_lo_u32 v5, v9, v3 +; GISEL-NEXT: v_mul_hi_u32 v7, v11, v3 +; GISEL-NEXT: v_mul_lo_u32 v8, v11, v4 +; GISEL-NEXT: v_mul_hi_u32 v3, v9, v3 +; GISEL-NEXT: v_mul_lo_u32 v14, v9, v4 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v9, v4 +; GISEL-NEXT: v_mul_hi_u32 v7, v11, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v14, v3 @@ -2567,168 +2545,163 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GISEL-NEXT: v_mul_hi_u32 v4, v13, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v9, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v3 -; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v13, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v9, 0 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v13, v[4:5] -; GISEL-NEXT: v_and_b32_e32 v11, 0xffffff, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v9, v3 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v9, v[7:8] -; GISEL-NEXT: v_mul_lo_u32 v5, v13, v3 -; GISEL-NEXT: v_mul_hi_u32 v3, v13, v3 -; GISEL-NEXT: v_mul_lo_u32 v7, v9, v4 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v3 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v11, 0 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v9, v[4:5] +; GISEL-NEXT: v_and_b32_e32 v12, 0xffffff, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v11, v3 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v11, v[7:8] +; GISEL-NEXT: v_mul_lo_u32 v5, v9, v3 +; GISEL-NEXT: v_mul_hi_u32 v3, v9, v3 +; GISEL-NEXT: v_mul_lo_u32 v7, v11, v4 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v13, v4 +; GISEL-NEXT: v_mul_lo_u32 v5, v9, v4 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 -; GISEL-NEXT: v_mul_hi_u32 v7, v9, v4 +; GISEL-NEXT: v_mul_hi_u32 v7, v11, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; GISEL-NEXT: v_mul_hi_u32 v4, v13, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v9, v4 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v13, v3, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v9, v3, vcc ; GISEL-NEXT: v_mul_lo_u32 v4, 0, v0 -; GISEL-NEXT: v_mul_lo_u32 v5, v11, v3 -; GISEL-NEXT: v_mul_hi_u32 v7, v11, v0 +; GISEL-NEXT: v_mul_lo_u32 v5, v12, v3 +; GISEL-NEXT: v_mul_hi_u32 v7, v12, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, 0, v0 -; GISEL-NEXT: v_and_b32_e32 v13, 0xffffff, v2 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GISEL-NEXT: v_mul_lo_u32 v5, 0, v3 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v11, v3 +; GISEL-NEXT: v_mul_hi_u32 v7, v12, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 ; GISEL-NEXT: v_mul_hi_u32 v4, 0, v3 -; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v6 ; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v0, 0 -; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v3 -; GISEL-NEXT: v_mov_b32_e32 v5, v8 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v4, v[5:6] -; GISEL-NEXT: v_mac_f32_e32 v12, 0x4f800000, v10 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v2, v12 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v0, v[8:9] -; GISEL-NEXT: v_sub_i32_e32 v11, vcc, v11, v7 -; GISEL-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 -; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v2 -; GISEL-NEXT: v_trunc_f32_e32 v8, v6 -; GISEL-NEXT: v_mac_f32_e32 v2, 0xcf800000, v8 -; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v2 -; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], 0, v5, vcc +; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v3 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v4, v[8:9] +; GISEL-NEXT: v_mac_f32_e32 v11, 0x4f800000, v10 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], 0, v0, v[5:6] +; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v11 +; GISEL-NEXT: v_sub_i32_e32 v11, vcc, v12, v7 +; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], 0, v8, vcc +; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 +; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 +; GISEL-NEXT: v_trunc_f32_e32 v6, v6 +; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6 +; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v5 ; GISEL-NEXT: v_sub_i32_e64 v15, s[4:5], 0, v3 +; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v6 ; GISEL-NEXT: v_subb_u32_e64 v16, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v15, v14, 0 -; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v8 -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v5 -; GISEL-NEXT: v_mov_b32_e32 v2, v7 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v15, v17, v[2:3] +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v15, v13, 0 +; GISEL-NEXT: v_sub_i32_e64 v17, s[4:5], 0, v8 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v15, v14, v[6:7] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v14, v[7:8] +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v13, v[7:8] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v10, -1, v2, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v2, v17, v6 -; GISEL-NEXT: v_mul_lo_u32 v7, v14, v9 -; GISEL-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v5, vcc -; GISEL-NEXT: v_mul_hi_u32 v5, v14, v6 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v12, -1, v6, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v6, v14, v5 +; GISEL-NEXT: v_mul_lo_u32 v7, v13, v9 +; GISEL-NEXT: v_mul_hi_u32 v10, v13, v5 +; GISEL-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v17, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v17, v9 -; GISEL-NEXT: v_mul_hi_u32 v6, v17, v6 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; GISEL-NEXT: v_mul_hi_u32 v7, v14, v9 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v10, v14, v9 +; GISEL-NEXT: v_mul_hi_u32 v5, v14, v5 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GISEL-NEXT: v_mul_hi_u32 v7, v13, v9 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v17, v9 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v2 -; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v17, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v15, v12, 0 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v10, v7 +; GISEL-NEXT: v_mul_hi_u32 v9, v14, v9 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v5 +; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v14, v6, vcc +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v15, v13, 0 ; GISEL-NEXT: v_sub_i32_e32 v11, vcc, v11, v1 -; GISEL-NEXT: v_mov_b32_e32 v2, v6 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v15, v14, v[2:3] ; GISEL-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v8, vcc -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v16, v12, v[6:7] -; GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v0 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v15, v14, v[6:7] +; GISEL-NEXT: v_add_i32_e32 v15, vcc, 1, v0 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v13, v[7:8] ; GISEL-NEXT: v_mul_lo_u32 v6, v14, v5 -; GISEL-NEXT: v_mul_lo_u32 v7, v12, v8 -; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v4, vcc -; GISEL-NEXT: v_mul_hi_u32 v9, v12, v5 +; GISEL-NEXT: v_addc_u32_e32 v18, vcc, 0, v4, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, v13, v9 +; GISEL-NEXT: v_mul_hi_u32 v8, v13, v5 ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v11, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 ; GISEL-NEXT: v_cndmask_b32_e32 v1, -1, v1, vcc ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, v14, v8 +; GISEL-NEXT: v_mul_lo_u32 v8, v14, v9 ; GISEL-NEXT: v_mul_hi_u32 v5, v14, v5 ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GISEL-NEXT: v_mul_hi_u32 v7, v12, v8 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v7, v13, v9 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; GISEL-NEXT: v_mul_hi_u32 v8, v14, v8 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; GISEL-NEXT: v_mul_hi_u32 v8, v14, v9 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v14, v6, vcc -; GISEL-NEXT: v_mul_lo_u32 v6, 0, v5 -; GISEL-NEXT: v_mul_lo_u32 v8, v13, v7 -; GISEL-NEXT: v_mul_hi_u32 v12, v13, v5 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v2 -; GISEL-NEXT: v_addc_u32_e32 v11, vcc, 0, v15, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 -; GISEL-NEXT: v_mul_lo_u32 v8, 0, v7 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v14, v6, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, 0, v5 +; GISEL-NEXT: v_mul_lo_u32 v8, v2, v6 +; GISEL-NEXT: v_mul_hi_u32 v11, v2, v5 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v15 +; GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v18, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; GISEL-NEXT: v_mul_lo_u32 v8, 0, v6 ; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v13, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v2, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v5, v6 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v12, 0 -; GISEL-NEXT: v_mul_hi_u32 v14, 0, v7 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v11 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v13, 0, v6 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v11, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc -; GISEL-NEXT: v_mov_b32_e32 v1, v6 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v14, v[1:2] -; GISEL-NEXT: v_cndmask_b32_e32 v8, v15, v11, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v8, vcc -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], 0, v12, v[6:7] -; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v13, v5 -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v8 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v8, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v15, v9, vcc +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v3, v13, v[6:7] +; GISEL-NEXT: v_cndmask_b32_e32 v9, v18, v10, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], 0, v11, v[7:8] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v9 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v9, vcc ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v3 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 @@ -2736,8 +2709,8 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v4, -1, v6, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v12 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v14, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v11 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v13, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 @@ -2748,8 +2721,8 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v14, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_sdiv_v2i64_24bit: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll index f4489c2239fda..9d6ffc9bbc0dc 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll @@ -172,11 +172,11 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: s_subb_u32 s15, 0, s9 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GFX8-NEXT: v_trunc_f32_e32 v2, v1 -; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 -; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX8-NEXT: v_trunc_f32_e32 v1, v1 +; GFX8-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1 +; GFX8-NEXT: v_add_f32_e32 v0, v2, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v2 +; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v1 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s14, v6, 0 ; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s14, v7, v[1:2] ; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 @@ -247,11 +247,11 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v0, v2 -; GFX8-NEXT: v_mul_hi_u32 v4, s11, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v0 +; GFX8-NEXT: v_mul_hi_u32 v3, s11, v1 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v6, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v4, v2 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v3, v2 ; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v7, v[1:2] ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s10, v0 ; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s9, v6, v[2:3] @@ -333,11 +333,11 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: s_subb_u32 s11, 0, s7 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GFX9-NEXT: v_trunc_f32_e32 v2, v1 -; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 -; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX9-NEXT: v_trunc_f32_e32 v1, v1 +; GFX9-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1 +; GFX9-NEXT: v_add_f32_e32 v0, v2, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v2 +; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v1 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v6, 0 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s10, v7, v[1:2] ; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 @@ -400,20 +400,20 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_mul_lo_u32 v4, s9, v1 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-NEXT: v_mul_hi_u32 v3, s8, v1 -; GFX9-NEXT: v_mov_b32_e32 v7, s7 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v6, 0 +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v0, v2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v7, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 ; GFX9-NEXT: v_add3_u32 v8, v3, v2, v5 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s6, v8, v[1:2] ; GFX9-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s8, v0 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s7, v6, v[2:3] +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s7, v7, v[2:3] ; GFX9-NEXT: v_subb_co_u32_e64 v1, s[0:1], v1, v4, vcc ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v1 ; GFX9-NEXT: v_sub_u32_e32 v2, s9, v4 @@ -421,19 +421,19 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v1 -; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v7, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] ; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s6, v0 ; GFX9-NEXT: v_subbrev_co_u32_e64 v5, s[0:1], 0, v2, vcc -; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v6 +; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v7 ; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v8, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v4 -; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v7, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v5 -; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s6, v4 +; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s6, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[0:1] ; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v10 ; GFX9-NEXT: v_subbrev_co_u32_e32 v2, vcc, 0, v2, vcc @@ -442,15 +442,15 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v10, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v8, v11, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, v10, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v8, v11, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v4, v0, v4, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[0:1] ; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], s[4:5] ; GFX9-NEXT: v_xor_b32_e32 v0, s0, v3 -; GFX9-NEXT: v_xor_b32_e32 v1, s1, v6 +; GFX9-NEXT: v_xor_b32_e32 v1, s1, v7 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc @@ -554,29 +554,29 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s8 ; GFX10-NEXT: v_add_co_u32 v0, s8, v5, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s8 +; GFX10-NEXT: v_mul_hi_u32 v5, s1, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v2, v6, v2 ; GFX10-NEXT: v_add_co_u32 v0, s8, v0, v3 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s8 -; GFX10-NEXT: v_add_co_u32 v5, s8, v0, v2 -; GFX10-NEXT: v_mul_hi_u32 v2, s1, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s8 -; GFX10-NEXT: v_add_nc_u32_e32 v3, v4, v3 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s8, s6, v5, 0 -; GFX10-NEXT: v_add3_u32 v3, v3, v6, v2 +; GFX10-NEXT: v_add_co_u32 v6, s8, v0, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v2, v4, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s8 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s8, s6, v6, 0 +; GFX10-NEXT: v_add3_u32 v3, v2, v3, v5 ; GFX10-NEXT: v_mad_u64_u32 v[1:2], s8, s6, v3, v[1:2] -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s8, s7, v5, v[1:2] -; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v5, 1 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s8, s7, v6, v[1:2] +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v6, 1 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v3, vcc_lo ; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v0 -; GFX10-NEXT: v_sub_nc_u32_e32 v6, s1, v1 +; GFX10-NEXT: v_sub_nc_u32_e32 v5, s1, v1 ; GFX10-NEXT: v_sub_co_ci_u32_e64 v1, s0, s1, v1, vcc_lo -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, s7, v6, vcc_lo +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v5, vcc_lo, s7, v5, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc_lo ; GFX10-NEXT: v_sub_co_u32 v8, vcc_lo, v0, s6 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v9, s0, 0, v6, vcc_lo +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v9, s0, 0, v5, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s7, v1 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, s7, v6, vcc_lo +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v5, vcc_lo, s7, v5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, -1, s0 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s6, v8 ; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, -1, s0 @@ -590,16 +590,16 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v10, v7, s0 ; GFX10-NEXT: v_sub_co_u32 v10, s0, v8, s6 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v6, s0, 0, v6, s0 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v5, s0, 0, v5, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v13, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v7 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v14, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v7, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v2, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v4, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v7, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v5, s0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_xor_b32_e32 v2, s4, v2 ; GFX10-NEXT: v_xor_b32_e32 v3, s5, v3 @@ -1308,11 +1308,11 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: s_subb_u32 s17, 0, s9 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GFX8-NEXT: v_trunc_f32_e32 v2, v1 -; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 -; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX8-NEXT: v_trunc_f32_e32 v1, v1 +; GFX8-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1 +; GFX8-NEXT: v_add_f32_e32 v0, v2, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v2 +; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v1 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v6, 0 ; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s16, v7, v[1:2] ; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 @@ -1386,163 +1386,162 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v0, v2 -; GFX8-NEXT: v_mul_hi_u32 v4, s11, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v0 +; GFX8-NEXT: v_mul_hi_u32 v3, s11, v1 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v6, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v4, v2 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v3, v2 ; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v7, v[1:2] ; GFX8-NEXT: v_sub_u32_e32 v9, vcc, s10, v0 ; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s9, v6, v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v1, s9 -; GFX8-NEXT: s_ashr_i32 s10, s3, 31 ; GFX8-NEXT: v_subb_u32_e64 v8, s[0:1], v8, v4, vcc ; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s11, v4 ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v9 -; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v8 -; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s8, v9 +; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; GFX8-NEXT: v_subbrev_u32_e64 v10, s[0:1], 0, v0, vcc -; GFX8-NEXT: v_add_u32_e64 v3, s[0:1], 1, v6 -; GFX8-NEXT: v_addc_u32_e64 v5, s[0:1], 0, v7, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v10 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s8, v9 +; GFX8-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v0, vcc +; GFX8-NEXT: v_add_u32_e64 v5, s[0:1], 1, v6 +; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v7, s[0:1] +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v4 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v3 +; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v10 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v4 +; GFX8-NEXT: v_subrev_u32_e32 v14, vcc, s8, v3 +; GFX8-NEXT: s_ashr_i32 s8, s3, 31 ; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1] -; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v3 -; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, v5, s[0:1] -; GFX8-NEXT: s_add_u32 s0, s18, s6 -; GFX8-NEXT: s_addc_u32 s1, s19, s6 -; GFX8-NEXT: s_add_u32 s2, s2, s10 -; GFX8-NEXT: s_mov_b32 s11, s10 -; GFX8-NEXT: s_addc_u32 s3, s3, s10 -; GFX8-NEXT: s_xor_b64 s[2:3], s[2:3], s[10:11] -; GFX8-NEXT: v_cvt_f32_u32_e32 v14, s3 -; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v1, vcc -; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s2 -; GFX8-NEXT: v_subrev_u32_e32 v15, vcc, s8, v4 -; GFX8-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v0, vcc -; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v14 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v5 +; GFX8-NEXT: s_add_u32 s10, s18, s6 +; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, v10, s[0:1] +; GFX8-NEXT: s_addc_u32 s11, s19, s6 +; GFX8-NEXT: s_add_u32 s0, s2, s8 +; GFX8-NEXT: s_mov_b32 s9, s8 +; GFX8-NEXT: s_addc_u32 s1, s3, s8 +; GFX8-NEXT: s_xor_b64 s[2:3], s[0:1], s[8:9] +; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s3 +; GFX8-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v0, vcc +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 +; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc -; GFX8-NEXT: s_xor_b64 s[8:9], s[0:1], s[6:7] -; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GFX8-NEXT: v_trunc_f32_e32 v11, v1 -; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v11 ; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v12, v0 +; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX8-NEXT: s_xor_b64 s[10:11], s[10:11], s[6:7] ; GFX8-NEXT: s_sub_u32 s5, 0, s2 -; GFX8-NEXT: v_cvt_u32_f32_e32 v11, v11 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v13, vcc -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s5, v12, 0 -; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v3, s[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[18:19], s5, v11, v[1:2] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v12, vcc +; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GFX8-NEXT: v_trunc_f32_e32 v1, v1 +; GFX8-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1 +; GFX8-NEXT: v_add_f32_e32 v0, v2, v0 +; GFX8-NEXT: v_cvt_u32_f32_e32 v11, v0 +; GFX8-NEXT: v_cvt_u32_f32_e32 v12, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v10, s[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[18:19], s5, v11, 0 +; GFX8-NEXT: v_cndmask_b32_e32 v10, v3, v14, vcc ; GFX8-NEXT: s_subb_u32 s20, 0, s3 -; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v5, s[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[18:19], s5, v12, v[1:2] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v5, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v15, vcc -; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[18:19], s20, v12, v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v2, v10, v16, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v9, v9, v1, s[0:1] -; GFX8-NEXT: v_mul_lo_u32 v1, v11, v0 -; GFX8-NEXT: v_mul_lo_u32 v3, v12, v4 -; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v2, s[0:1] -; GFX8-NEXT: v_mul_hi_u32 v2, v12, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v11, v0 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 -; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[18:19], s20, v11, v[2:3] +; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v1, s[0:1] +; GFX8-NEXT: v_mul_lo_u32 v1, v12, v0 ; GFX8-NEXT: v_mul_lo_u32 v2, v11, v4 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 -; GFX8-NEXT: v_mul_hi_u32 v3, v12, v4 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0 +; GFX8-NEXT: v_mul_hi_u32 v3, v11, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v12, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[0:1] +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v3, v12, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_mul_hi_u32 v2, v11, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 -; GFX8-NEXT: v_mul_hi_u32 v3, v11, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 +; GFX8-NEXT: v_mul_hi_u32 v3, v12, v4 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 -; GFX8-NEXT: v_add_u32_e32 v10, vcc, v12, v0 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, v11, v0 ; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v10, 0 -; GFX8-NEXT: v_addc_u32_e32 v11, vcc, v11, v1, vcc -; GFX8-NEXT: v_xor_b32_e32 v1, s16, v6 -; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s5, v11, v[0:1] -; GFX8-NEXT: v_xor_b32_e32 v5, s17, v7 -; GFX8-NEXT: v_mov_b32_e32 v6, s17 -; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s16, v1 -; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v5, v6, vcc -; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s20, v10, v[3:4] +; GFX8-NEXT: v_addc_u32_e32 v11, vcc, v12, v1, vcc +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v11, v[3:4] +; GFX8-NEXT: v_xor_b32_e32 v6, s16, v6 +; GFX8-NEXT: v_xor_b32_e32 v1, s17, v7 +; GFX8-NEXT: v_mov_b32_e32 v7, s17 +; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s16, v6 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v7, vcc +; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s20, v10, v[4:5] ; GFX8-NEXT: v_mul_lo_u32 v4, v11, v2 ; GFX8-NEXT: v_xor_b32_e32 v3, s4, v9 -; GFX8-NEXT: v_mul_lo_u32 v7, v10, v5 +; GFX8-NEXT: v_mul_lo_u32 v7, v10, v6 ; GFX8-NEXT: v_mul_hi_u32 v9, v10, v2 ; GFX8-NEXT: v_mul_hi_u32 v2, v11, v2 -; GFX8-NEXT: v_xor_b32_e32 v6, s4, v8 +; GFX8-NEXT: v_xor_b32_e32 v5, s4, v8 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v9, v11, v5 +; GFX8-NEXT: v_mul_lo_u32 v9, v11, v6 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4 -; GFX8-NEXT: v_mul_hi_u32 v7, v10, v5 +; GFX8-NEXT: v_mul_hi_u32 v7, v10, v6 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v9, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v9, v7 -; GFX8-NEXT: v_mul_hi_u32 v5, v11, v5 +; GFX8-NEXT: v_mul_hi_u32 v6, v11, v6 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v6, v4 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v10, v2 -; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v11, v4, vcc -; GFX8-NEXT: v_mul_lo_u32 v9, s9, v2 -; GFX8-NEXT: v_mul_lo_u32 v10, s8, v7 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v11, v4, vcc +; GFX8-NEXT: v_mul_lo_u32 v7, s11, v2 +; GFX8-NEXT: v_mul_lo_u32 v9, s10, v6 ; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s4, v3 -; GFX8-NEXT: v_mul_hi_u32 v3, s8, v2 +; GFX8-NEXT: v_mul_hi_u32 v3, s10, v2 ; GFX8-NEXT: v_mov_b32_e32 v8, s4 -; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v6, v8, vcc -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v9, v10 +; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v5, v8, vcc +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v7, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v6, s9, v7 -; GFX8-NEXT: v_mul_hi_u32 v2, s9, v2 +; GFX8-NEXT: v_mul_lo_u32 v7, s11, v6 +; GFX8-NEXT: v_mul_hi_u32 v2, s11, v2 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v8, v3 -; GFX8-NEXT: v_mul_hi_u32 v8, s8, v7 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX8-NEXT: v_mul_hi_u32 v8, s10, v6 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v7, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v8 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 ; GFX8-NEXT: v_add_u32_e32 v10, vcc, v2, v3 -; GFX8-NEXT: v_mul_hi_u32 v7, s9, v7 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8-NEXT: v_mul_hi_u32 v6, s11, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 ; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v10, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v8 -; GFX8-NEXT: v_add_u32_e32 v11, vcc, v7, v6 +; GFX8-NEXT: v_add_u32_e32 v11, vcc, v6, v7 ; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v11, v[3:4] -; GFX8-NEXT: v_mov_b32_e32 v12, s9 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s8, v2 +; GFX8-NEXT: v_mov_b32_e32 v12, s11 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s10, v2 ; GFX8-NEXT: v_mad_u64_u32 v[8:9], s[0:1], s3, v10, v[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: v_subb_u32_e64 v6, s[0:1], v12, v8, vcc -; GFX8-NEXT: v_sub_u32_e64 v7, s[0:1], s9, v8 +; GFX8-NEXT: v_sub_u32_e64 v7, s[0:1], s11, v8 ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v6 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v2 @@ -1575,7 +1574,7 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_cndmask_b32_e64 v8, v11, v14, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v9, v2, v9, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v3, s[0:1] -; GFX8-NEXT: s_xor_b64 s[0:1], s[6:7], s[10:11] +; GFX8-NEXT: s_xor_b64 s[0:1], s[6:7], s[8:9] ; GFX8-NEXT: v_xor_b32_e32 v2, s0, v7 ; GFX8-NEXT: v_xor_b32_e32 v3, s1, v8 ; GFX8-NEXT: v_mov_b32_e32 v7, s1 @@ -1619,11 +1618,11 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: s_subb_u32 s17, 0, s9 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GFX9-NEXT: v_trunc_f32_e32 v2, v1 -; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 -; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX9-NEXT: v_trunc_f32_e32 v1, v1 +; GFX9-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1 +; GFX9-NEXT: v_add_f32_e32 v0, v2, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v2 +; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v1 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v6, 0 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s16, v7, v[1:2] ; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 @@ -1703,152 +1702,151 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_sub_co_u32_e32 v9, vcc, s10, v0 ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s9, v6, v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: s_ashr_i32 s10, s3, 31 ; GFX9-NEXT: v_subb_co_u32_e64 v8, s[0:1], v8, v4, vcc -; GFX9-NEXT: v_sub_u32_e32 v0, s11, v4 ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v8 +; GFX9-NEXT: v_sub_u32_e32 v0, s11, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v9 -; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v1, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v8 -; GFX9-NEXT: v_subrev_co_u32_e32 v10, vcc, s8, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v4, v2, v3, s[0:1] -; GFX9-NEXT: v_subbrev_co_u32_e64 v11, s[0:1], 0, v0, vcc -; GFX9-NEXT: v_add_co_u32_e64 v2, s[0:1], 1, v6 -; GFX9-NEXT: v_addc_co_u32_e64 v3, s[0:1], 0, v7, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v11 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v11 -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v12, s[0:1], 1, v2 -; GFX9-NEXT: v_addc_co_u32_e64 v13, s[0:1], 0, v3, s[0:1] -; GFX9-NEXT: s_add_u32 s0, s18, s6 -; GFX9-NEXT: s_addc_u32 s1, s19, s6 -; GFX9-NEXT: s_add_u32 s2, s2, s10 -; GFX9-NEXT: s_mov_b32 s11, s10 -; GFX9-NEXT: s_addc_u32 s3, s3, s10 -; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[10:11] -; GFX9-NEXT: v_cvt_f32_u32_e32 v14, s3 -; GFX9-NEXT: v_cvt_f32_u32_e32 v15, s2 ; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v1, vcc -; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f800000, v14 -; GFX9-NEXT: v_add_f32_e32 v1, v1, v15 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: v_subrev_co_u32_e32 v14, vcc, s8, v10 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] +; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s8, v9 +; GFX9-NEXT: v_subbrev_co_u32_e64 v4, s[0:1], 0, v0, vcc +; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], 1, v6 +; GFX9-NEXT: v_addc_co_u32_e64 v10, s[0:1], 0, v7, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v3 +; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v4 +; GFX9-NEXT: v_subrev_co_u32_e32 v14, vcc, s8, v3 +; GFX9-NEXT: s_ashr_i32 s8, s3, 31 +; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v12, s[0:1], 1, v5 +; GFX9-NEXT: s_add_u32 s10, s18, s6 +; GFX9-NEXT: v_addc_co_u32_e64 v13, s[0:1], 0, v10, s[0:1] +; GFX9-NEXT: s_addc_u32 s11, s19, s6 +; GFX9-NEXT: s_add_u32 s0, s2, s8 +; GFX9-NEXT: s_mov_b32 s9, s8 +; GFX9-NEXT: s_addc_u32 s1, s3, s8 +; GFX9-NEXT: s_xor_b64 s[2:3], s[0:1], s[8:9] +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 ; GFX9-NEXT: v_subbrev_co_u32_e32 v15, vcc, 0, v0, vcc -; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1 -; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GFX9-NEXT: v_trunc_f32_e32 v16, v1 -; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v17, v0 -; GFX9-NEXT: s_xor_b64 s[8:9], s[0:1], s[6:7] +; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: s_xor_b64 s[10:11], s[10:11], s[6:7] ; GFX9-NEXT: s_sub_u32 s5, 0, s2 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s5, v17, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v2, v12, vcc -; GFX9-NEXT: v_cvt_u32_f32_e32 v12, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v3, v13, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v12, vcc +; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GFX9-NEXT: v_trunc_f32_e32 v1, v1 +; GFX9-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1 +; GFX9-NEXT: v_add_f32_e32 v0, v2, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v11, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v12, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v10, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[18:19], s5, v11, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v3, v14, vcc ; GFX9-NEXT: s_subb_u32 s20, 0, s3 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v12, v[1:2] -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v6, v5, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[18:19], s20, v17, v[2:3] -; GFX9-NEXT: v_mul_lo_u32 v2, v12, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v10, v14, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, v17, v4 -; GFX9-NEXT: v_mul_hi_u32 v10, v17, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v13, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v7, v11, v15, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v10, v12, v4 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[18:19], s5, v12, v[1:2] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v5, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v15, vcc +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[18:19], s20, v11, v[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v1, s[0:1] +; GFX9-NEXT: v_mul_lo_u32 v1, v12, v0 +; GFX9-NEXT: v_mul_lo_u32 v2, v11, v4 +; GFX9-NEXT: v_mul_hi_u32 v3, v11, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, v12, v0 -; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_mul_hi_u32 v3, v17, v4 -; GFX9-NEXT: v_mul_hi_u32 v4, v12, v4 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v10, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[0:1] +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v3, v12, v4 +; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX9-NEXT: v_mul_hi_u32 v2, v11, v4 +; GFX9-NEXT: v_xor_b32_e32 v6, s16, v6 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v3, v10, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v2, v3, v2, v4 -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v17, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v12, v2, vcc -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[18:19], s5, v10, 0 -; GFX9-NEXT: v_xor_b32_e32 v1, s16, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v7, v8, v7, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s5, v11, v[0:1] -; GFX9-NEXT: v_xor_b32_e32 v8, s17, v5 -; GFX9-NEXT: v_mov_b32_e32 v12, s17 -; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s20, v10, v[3:4] -; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s16, v1 -; GFX9-NEXT: v_mul_lo_u32 v3, v11, v2 -; GFX9-NEXT: v_mul_lo_u32 v6, v10, v5 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v8, v12, vcc -; GFX9-NEXT: v_mul_hi_u32 v8, v10, v2 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v8, v11, v5 +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_mul_hi_u32 v3, v12, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v0 +; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v10, 0 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v12, v1, vcc +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v11, v[3:4] +; GFX9-NEXT: v_xor_b32_e32 v1, s17, v7 +; GFX9-NEXT: v_mov_b32_e32 v7, s17 +; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s16, v6 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s20, v10, v[4:5] +; GFX9-NEXT: v_mul_lo_u32 v4, v11, v2 +; GFX9-NEXT: v_xor_b32_e32 v3, s4, v9 +; GFX9-NEXT: v_mul_lo_u32 v7, v10, v6 +; GFX9-NEXT: v_mul_hi_u32 v9, v10, v2 ; GFX9-NEXT: v_mul_hi_u32 v2, v11, v2 -; GFX9-NEXT: v_add_u32_e32 v3, v6, v3 -; GFX9-NEXT: v_mul_hi_u32 v6, v10, v5 -; GFX9-NEXT: v_mul_hi_u32 v5, v11, v5 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 -; GFX9-NEXT: v_add_u32_e32 v6, v8, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v3, v6, v3, v5 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v10, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v11, v3, vcc -; GFX9-NEXT: v_mul_lo_u32 v5, s9, v2 -; GFX9-NEXT: v_mul_lo_u32 v6, s8, v3 -; GFX9-NEXT: v_xor_b32_e32 v4, s4, v9 -; GFX9-NEXT: v_mul_hi_u32 v9, s8, v2 -; GFX9-NEXT: v_mul_hi_u32 v2, s9, v2 -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v9, s9, v3 -; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 -; GFX9-NEXT: v_mul_hi_u32 v6, s8, v3 -; GFX9-NEXT: v_mul_hi_u32 v12, s9, v3 +; GFX9-NEXT: v_xor_b32_e32 v5, s4, v8 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v9, v11, v6 +; GFX9-NEXT: v_add_u32_e32 v4, v7, v4 +; GFX9-NEXT: v_mul_hi_u32 v7, v10, v6 +; GFX9-NEXT: v_mul_hi_u32 v6, v11, v6 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v9, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v2, v5 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v10, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GFX9-NEXT: v_add_u32_e32 v6, v9, v6 -; GFX9-NEXT: v_xor_b32_e32 v7, s4, v7 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 +; GFX9-NEXT: v_add_u32_e32 v7, v9, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v4, v7, v4, v6 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v10, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v11, v4, vcc +; GFX9-NEXT: v_mul_lo_u32 v7, s11, v2 +; GFX9-NEXT: v_mul_lo_u32 v9, s10, v6 +; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s4, v3 +; GFX9-NEXT: v_mul_hi_u32 v3, s10, v2 ; GFX9-NEXT: v_mov_b32_e32 v8, s4 -; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s4, v4 -; GFX9-NEXT: v_add3_u32 v11, v6, v11, v12 -; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v8, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v8, vcc +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v7, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v7, s11, v6 +; GFX9-NEXT: v_mul_hi_u32 v2, s11, v2 +; GFX9-NEXT: v_add_u32_e32 v3, v8, v3 +; GFX9-NEXT: v_mul_hi_u32 v8, s10, v6 +; GFX9-NEXT: v_mul_hi_u32 v6, s11, v6 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v7, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v2, v3 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v10, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GFX9-NEXT: v_add_u32_e32 v7, v7, v8 +; GFX9-NEXT: v_add3_u32 v11, v7, v9, v6 ; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v11, v[3:4] -; GFX9-NEXT: v_mov_b32_e32 v12, s9 -; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s8, v2 +; GFX9-NEXT: v_mov_b32_e32 v12, s11 +; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s10, v2 ; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[0:1], s3, v10, v[6:7] ; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: v_subb_co_u32_e64 v6, s[0:1], v12, v8, vcc ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v6 -; GFX9-NEXT: v_sub_u32_e32 v7, s9, v8 +; GFX9-NEXT: v_sub_u32_e32 v7, s11, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[0:1] @@ -1880,7 +1878,7 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_cndmask_b32_e64 v8, v11, v14, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v9, v2, v9, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v3, s[0:1] -; GFX9-NEXT: s_xor_b64 s[0:1], s[6:7], s[10:11] +; GFX9-NEXT: s_xor_b64 s[0:1], s[6:7], s[8:9] ; GFX9-NEXT: v_xor_b32_e32 v2, s0, v7 ; GFX9-NEXT: v_xor_b32_e32 v3, s1, v8 ; GFX9-NEXT: v_mov_b32_e32 v7, s1 @@ -1917,21 +1915,21 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: s_subb_u32 s20, 0, s7 ; GFX10-NEXT: s_xor_b64 s[16:17], s[4:5], s[8:9] ; GFX10-NEXT: s_ashr_i32 s8, s19, 31 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX10-NEXT: s_ashr_i32 s10, s3, 31 +; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 ; GFX10-NEXT: s_add_u32 s18, s18, s8 ; GFX10-NEXT: s_addc_u32 s19, s19, s8 -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 ; GFX10-NEXT: s_add_u32 s2, s2, s10 ; GFX10-NEXT: s_mov_b32 s11, s10 ; GFX10-NEXT: s_addc_u32 s3, s3, s10 -; GFX10-NEXT: s_mov_b32 s9, s8 -; GFX10-NEXT: s_xor_b64 s[2:3], s[2:3], s[10:11] ; GFX10-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX10-NEXT: s_xor_b64 s[2:3], s[2:3], s[10:11] +; GFX10-NEXT: s_mov_b32 s9, s8 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s3 ; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s2 -; GFX10-NEXT: s_xor_b64 s[18:19], s[18:19], s[8:9] ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX10-NEXT: s_xor_b64 s[18:19], s[18:19], s[8:9] ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -1940,256 +1938,253 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_trunc_f32_e32 v2, v2 ; GFX10-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v1 ; GFX10-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 +; GFX10-NEXT: v_cvt_u32_f32_e32 v7, v2 ; GFX10-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 -; GFX10-NEXT: v_cvt_u32_f32_e32 v9, v2 ; GFX10-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX10-NEXT: v_trunc_f32_e32 v6, v4 -; GFX10-NEXT: v_cvt_u32_f32_e32 v7, v0 -; GFX10-NEXT: v_mul_f32_e32 v4, 0xcf800000, v6 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, s21, v7, 0 +; GFX10-NEXT: v_trunc_f32_e32 v5, v4 +; GFX10-NEXT: v_cvt_u32_f32_e32 v6, v0 +; GFX10-NEXT: v_mul_f32_e32 v4, 0xcf800000, v5 +; GFX10-NEXT: v_cvt_u32_f32_e32 v9, v5 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, s21, v6, 0 ; GFX10-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX10-NEXT: s_sub_u32 s5, 0, s2 ; GFX10-NEXT: v_cvt_u32_f32_e32 v8, v3 -; GFX10-NEXT: v_mul_hi_u32 v10, v9, v0 -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s22, s5, v8, 0 -; GFX10-NEXT: v_mad_u64_u32 v[4:5], s22, s21, v9, v[1:2] -; GFX10-NEXT: v_cvt_u32_f32_e32 v5, v6 -; GFX10-NEXT: v_mov_b32_e32 v1, v3 -; GFX10-NEXT: v_mul_hi_u32 v6, v7, v0 +; GFX10-NEXT: v_mad_u64_u32 v[3:4], s5, s21, v7, v[1:2] +; GFX10-NEXT: s_sub_u32 s5, 0, s2 +; GFX10-NEXT: v_mul_lo_u32 v10, v7, v0 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s23, s5, v8, 0 ; GFX10-NEXT: s_subb_u32 s22, 0, s3 -; GFX10-NEXT: v_mul_hi_u32 v12, v8, v2 -; GFX10-NEXT: v_mul_lo_u32 v11, v5, v2 -; GFX10-NEXT: v_mad_u64_u32 v[3:4], s23, s20, v7, v[4:5] -; GFX10-NEXT: v_mul_lo_u32 v4, v9, v0 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s23, s5, v5, v[1:2] -; GFX10-NEXT: v_mul_hi_u32 v2, v5, v2 -; GFX10-NEXT: v_mul_lo_u32 v13, v7, v3 -; GFX10-NEXT: v_mul_lo_u32 v14, v9, v3 -; GFX10-NEXT: v_mul_hi_u32 v15, v7, v3 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s23, s22, v8, v[0:1] -; GFX10-NEXT: v_mul_hi_u32 v1, v9, v3 -; GFX10-NEXT: v_add_co_u32 v3, s23, v4, v13 +; GFX10-NEXT: v_mul_hi_u32 v12, v7, v0 +; GFX10-NEXT: v_mad_u64_u32 v[3:4], s23, s20, v6, v[3:4] +; GFX10-NEXT: v_mul_hi_u32 v11, v6, v0 +; GFX10-NEXT: v_mul_hi_u32 v14, v9, v1 +; GFX10-NEXT: v_mad_u64_u32 v[4:5], s23, s5, v9, v[2:3] +; GFX10-NEXT: v_mul_hi_u32 v5, v8, v1 +; GFX10-NEXT: v_mul_lo_u32 v13, v6, v3 +; GFX10-NEXT: v_mul_lo_u32 v15, v7, v3 +; GFX10-NEXT: v_mul_lo_u32 v2, v9, v1 +; GFX10-NEXT: v_mul_hi_u32 v16, v6, v3 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s23, s22, v8, v[4:5] +; GFX10-NEXT: v_mul_hi_u32 v1, v7, v3 +; GFX10-NEXT: v_add_co_u32 v3, s23, v10, v13 ; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s23 -; GFX10-NEXT: v_add_co_u32 v10, s23, v14, v10 -; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s23 -; GFX10-NEXT: v_mul_lo_u32 v14, v8, v0 -; GFX10-NEXT: v_add_co_u32 v3, s23, v3, v6 +; GFX10-NEXT: v_add_co_u32 v10, s23, v15, v12 +; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s23 +; GFX10-NEXT: v_mul_lo_u32 v13, v8, v0 +; GFX10-NEXT: v_add_co_u32 v3, s23, v3, v11 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s23 -; GFX10-NEXT: v_add_co_u32 v6, s23, v10, v15 -; GFX10-NEXT: v_mul_lo_u32 v15, v5, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s23 +; GFX10-NEXT: v_mul_lo_u32 v15, v9, v0 +; GFX10-NEXT: v_add_co_u32 v10, s23, v10, v16 +; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s23 ; GFX10-NEXT: v_mul_hi_u32 v16, v8, v0 -; GFX10-NEXT: v_mul_hi_u32 v17, v5, v0 +; GFX10-NEXT: v_mul_hi_u32 v17, v9, v0 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v4, v3 -; GFX10-NEXT: v_add_co_u32 v4, s23, v11, v14 -; GFX10-NEXT: v_add_nc_u32_e32 v3, v13, v10 -; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s23 -; GFX10-NEXT: v_add_co_u32 v2, s23, v15, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s23 -; GFX10-NEXT: v_add_co_u32 v0, s23, v6, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s23 -; GFX10-NEXT: v_add_co_u32 v4, s23, v4, v12 +; GFX10-NEXT: v_add_co_u32 v2, s23, v2, v13 +; GFX10-NEXT: v_add_nc_u32_e32 v3, v12, v11 ; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s23 -; GFX10-NEXT: v_add_co_u32 v2, s23, v2, v16 -; GFX10-NEXT: v_add3_u32 v1, v3, v6, v1 -; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v7, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v3, v10, v4 +; GFX10-NEXT: v_add_co_u32 v11, s23, v15, v14 ; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s23 -; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v9, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32 v0, s23, v10, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s23 +; GFX10-NEXT: v_add_co_u32 v2, s23, v2, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s23 +; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v6, v0 +; GFX10-NEXT: v_add3_u32 v1, v3, v10, v1 +; GFX10-NEXT: v_add_co_u32 v5, s23, v11, v16 +; GFX10-NEXT: v_add_nc_u32_e32 v2, v4, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s23 +; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v7, v1, vcc_lo ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s23, s21, v6, 0 -; GFX10-NEXT: v_add_co_u32 v2, s23, v2, v3 -; GFX10-NEXT: v_add_nc_u32_e32 v4, v11, v12 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s23 -; GFX10-NEXT: v_mov_b32_e32 v10, 0 +; GFX10-NEXT: v_add_co_u32 v2, s23, v5, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v3, v12, v11 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s23 ; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v8, v2 -; GFX10-NEXT: v_mul_hi_u32 v11, v7, v0 -; GFX10-NEXT: v_add3_u32 v3, v4, v3, v17 -; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v5, v3, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s23, s5, v8, 0 -; GFX10-NEXT: v_mad_u64_u32 v[4:5], s21, s21, v7, v[1:2] -; GFX10-NEXT: v_mov_b32_e32 v1, v3 -; GFX10-NEXT: v_mul_lo_u32 v12, v9, v2 -; GFX10-NEXT: v_mul_hi_u32 v13, v8, v2 -; GFX10-NEXT: v_mad_u64_u32 v[3:4], s20, s20, v6, v[4:5] -; GFX10-NEXT: v_mul_lo_u32 v4, v7, v0 -; GFX10-NEXT: v_mul_hi_u32 v5, v6, v0 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, s5, v9, v[1:2] -; GFX10-NEXT: v_mul_hi_u32 v2, v9, v2 -; GFX10-NEXT: v_mul_lo_u32 v14, v6, v3 +; GFX10-NEXT: v_mul_lo_u32 v10, v7, v0 +; GFX10-NEXT: v_add3_u32 v5, v3, v4, v17 +; GFX10-NEXT: v_mad_u64_u32 v[3:4], s21, s21, v7, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s21, s5, v8, 0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v5, vcc_lo +; GFX10-NEXT: v_mul_hi_u32 v12, v7, v0 +; GFX10-NEXT: v_mul_hi_u32 v11, v6, v0 +; GFX10-NEXT: v_mad_u64_u32 v[3:4], s20, s20, v6, v[3:4] +; GFX10-NEXT: v_mul_hi_u32 v14, v9, v1 +; GFX10-NEXT: v_mad_u64_u32 v[4:5], s5, s5, v9, v[2:3] +; GFX10-NEXT: v_mul_hi_u32 v5, v8, v1 +; GFX10-NEXT: v_mul_lo_u32 v13, v6, v3 ; GFX10-NEXT: v_mul_lo_u32 v15, v7, v3 +; GFX10-NEXT: v_mul_lo_u32 v2, v9, v1 ; GFX10-NEXT: v_mul_hi_u32 v16, v6, v3 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, s22, v8, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, s22, v8, v[4:5] ; GFX10-NEXT: v_mul_hi_u32 v1, v7, v3 -; GFX10-NEXT: v_add_co_u32 v3, s5, v4, v14 +; GFX10-NEXT: v_add_co_u32 v3, s5, v10, v13 ; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v11, s5, v15, v11 -; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v3, s5, v3, v5 -; GFX10-NEXT: v_mul_lo_u32 v15, v8, v0 +; GFX10-NEXT: v_add_co_u32 v10, s5, v15, v12 +; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s5 +; GFX10-NEXT: v_mul_lo_u32 v13, v8, v0 +; GFX10-NEXT: v_add_co_u32 v3, s5, v3, v11 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v5, s5, v11, v16 -; GFX10-NEXT: v_mul_lo_u32 v16, v9, v0 +; GFX10-NEXT: v_mul_lo_u32 v15, v9, v0 +; GFX10-NEXT: v_add_co_u32 v10, s5, v10, v16 ; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s5 ; GFX10-NEXT: v_add_nc_u32_e32 v3, v4, v3 -; GFX10-NEXT: v_mul_hi_u32 v17, v8, v0 -; GFX10-NEXT: v_mul_hi_u32 v0, v9, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v4, v14, v11 -; GFX10-NEXT: v_add_co_u32 v11, s5, v12, v15 -; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v2, s5, v16, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v3, s5, v5, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v11, s5, v11, v13 -; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v2, s5, v2, v17 -; GFX10-NEXT: v_add3_u32 v1, v4, v5, v1 -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, v6, v3 +; GFX10-NEXT: v_add_co_u32 v2, s5, v2, v13 +; GFX10-NEXT: v_mul_hi_u32 v16, v8, v0 ; GFX10-NEXT: v_add_nc_u32_e32 v4, v12, v11 +; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s5 +; GFX10-NEXT: v_add_co_u32 v12, s5, v15, v14 ; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s5 +; GFX10-NEXT: v_add_co_u32 v3, s5, v10, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s5 +; GFX10-NEXT: v_add_co_u32 v2, s5, v2, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s5 +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, v6, v3 +; GFX10-NEXT: v_add3_u32 v1, v4, v10, v1 +; GFX10-NEXT: v_add_co_u32 v5, s5, v12, v16 +; GFX10-NEXT: v_add_nc_u32_e32 v2, v11, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s5 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v7, v1, vcc_lo +; GFX10-NEXT: v_mul_hi_u32 v0, v9, v0 ; GFX10-NEXT: v_mul_lo_u32 v6, s1, v3 -; GFX10-NEXT: v_add_co_u32 v2, s5, v2, v4 -; GFX10-NEXT: v_add_nc_u32_e32 v5, v14, v13 -; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s5 -; GFX10-NEXT: v_mul_lo_u32 v11, s0, v1 +; GFX10-NEXT: v_add_co_u32 v2, s5, v5, v2 +; GFX10-NEXT: v_mul_lo_u32 v10, s0, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v4, v13, v12 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s5 ; GFX10-NEXT: v_mul_hi_u32 v7, s0, v3 ; GFX10-NEXT: v_mul_hi_u32 v3, s1, v3 -; GFX10-NEXT: v_mul_lo_u32 v12, s1, v1 -; GFX10-NEXT: v_add3_u32 v0, v5, v4, v0 +; GFX10-NEXT: v_mul_lo_u32 v11, s1, v1 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v8, v2 +; GFX10-NEXT: v_add3_u32 v0, v4, v5, v0 ; GFX10-NEXT: v_mul_hi_u32 v4, s0, v1 ; GFX10-NEXT: v_mul_hi_u32 v5, s1, v1 -; GFX10-NEXT: v_add_co_u32 v1, s5, v6, v11 -; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, v9, v0, vcc_lo +; GFX10-NEXT: v_add_co_u32 v1, s5, v6, v10 ; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v3, s5, v12, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s5 +; GFX10-NEXT: v_add_co_u32 v3, s5, v11, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s5 ; GFX10-NEXT: v_add_co_u32 v1, s5, v1, v7 -; GFX10-NEXT: v_mul_lo_u32 v0, s19, v2 -; GFX10-NEXT: v_mul_lo_u32 v12, s18, v8 +; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, v9, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s5 +; GFX10-NEXT: v_mul_lo_u32 v0, s19, v2 ; GFX10-NEXT: v_add_co_u32 v3, s5, v3, v4 +; GFX10-NEXT: v_mul_lo_u32 v7, s18, v8 +; GFX10-NEXT: v_add_nc_u32_e32 v1, v6, v1 ; GFX10-NEXT: v_mul_hi_u32 v9, s18, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s5 ; GFX10-NEXT: v_mul_hi_u32 v2, s19, v2 -; GFX10-NEXT: v_mul_lo_u32 v7, s19, v8 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v6, v1 -; GFX10-NEXT: v_add_co_u32 v6, s5, v0, v12 -; GFX10-NEXT: v_mul_hi_u32 v13, s18, v8 -; GFX10-NEXT: v_add_nc_u32_e32 v4, v11, v4 -; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v12, s5, v3, v1 -; GFX10-NEXT: v_add_co_u32 v2, s20, v7, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s5 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, s6, v12, 0 -; GFX10-NEXT: v_add_co_u32 v6, s5, v6, v9 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v9, s5, v2, v13 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s20 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s5 -; GFX10-NEXT: v_add3_u32 v4, v4, v7, v5 -; GFX10-NEXT: v_add_nc_u32_e32 v6, v11, v6 -; GFX10-NEXT: v_mul_hi_u32 v5, s19, v8 -; GFX10-NEXT: v_add_co_u32 v7, vcc_lo, v12, 1 -; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s5 +; GFX10-NEXT: v_mul_lo_u32 v6, s19, v8 +; GFX10-NEXT: v_add_co_u32 v3, s5, v3, v1 +; GFX10-NEXT: v_add_co_u32 v7, s20, v0, v7 +; GFX10-NEXT: v_add_nc_u32_e32 v4, v10, v4 +; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s5 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, s6, v3, 0 +; GFX10-NEXT: v_mul_hi_u32 v11, s18, v8 +; GFX10-NEXT: v_add_co_u32 v6, s5, v6, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s5 +; GFX10-NEXT: v_add3_u32 v4, v4, v12, v5 +; GFX10-NEXT: v_add_co_u32 v2, s5, v7, v9 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s20 +; GFX10-NEXT: v_mul_hi_u32 v7, s19, v8 ; GFX10-NEXT: v_mad_u64_u32 v[1:2], s5, s6, v4, v[1:2] -; GFX10-NEXT: v_add_co_u32 v6, s5, v9, v6 -; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s5 -; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v4, vcc_lo -; GFX10-NEXT: v_add_co_u32 v11, vcc_lo, v7, 1 -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s5, s7, v12, v[1:2] -; GFX10-NEXT: v_add3_u32 v5, v3, v9, v5 -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s5, s2, v6, 0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, 0, v8, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v14, vcc_lo, s0, v0 -; GFX10-NEXT: v_sub_nc_u32_e32 v9, s1, v1 -; GFX10-NEXT: v_sub_co_ci_u32_e64 v15, s0, s1, v1, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v9, vcc_lo, s7, v9, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v3, vcc_lo, v14, s6 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v16, s0, 0, v9, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s6, v14 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v9, vcc_lo, s7, v9, vcc_lo +; GFX10-NEXT: v_add_co_u32 v6, s5, v6, v11 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s5 +; GFX10-NEXT: v_add_nc_u32_e32 v5, v10, v5 +; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v3, 1 +; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, 0, v4, vcc_lo +; GFX10-NEXT: v_add_nc_u32_e32 v8, v13, v2 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s5, s7, v3, v[1:2] +; GFX10-NEXT: v_add_co_u32 v5, s5, v6, v5 +; GFX10-NEXT: v_sub_co_u32 v12, vcc_lo, s0, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s5 +; GFX10-NEXT: v_mov_b32_e32 v9, 0 +; GFX10-NEXT: v_sub_nc_u32_e32 v6, s1, v1 +; GFX10-NEXT: v_sub_co_ci_u32_e64 v13, s0, s1, v1, vcc_lo +; GFX10-NEXT: v_add3_u32 v7, v8, v2, v7 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, s7, v6, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v12 +; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v15, vcc_lo, v12, s6 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v16, s0, 0, v6, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s7, v13 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, s7, v6, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v17, 0, -1, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s6, v3 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s6, v15 ; GFX10-NEXT: v_cndmask_b32_e64 v18, 0, -1, s0 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s7, v16 ; GFX10-NEXT: v_cndmask_b32_e64 v19, 0, -1, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s7, v15 -; GFX10-NEXT: v_cndmask_b32_e64 v20, 0, -1, s0 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s2, v5, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s2, v5, 0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s7, v13 +; GFX10-NEXT: v_cndmask_b32_e64 v14, v17, v14, s0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s7, v16 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v19, v18, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s7, v15 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v17, v20, v17, s0 -; GFX10-NEXT: v_sub_co_u32 v1, s0, v3, s6 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v9, s0, 0, v9, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s1, s3, v6, v[0:1] -; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v17 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v13, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v1, v12, v7, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v16, v9, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, s18, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v8, s0 -; GFX10-NEXT: v_sub_co_ci_u32_e64 v8, s1, s19, v0, vcc_lo -; GFX10-NEXT: v_sub_nc_u32_e32 v0, s19, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v14, v3, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v15, v7, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s3, v8 -; GFX10-NEXT: v_xor_b32_e32 v1, s16, v1 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v11, vcc_lo, s3, v0, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2 -; GFX10-NEXT: v_xor_b32_e32 v4, s17, v4 -; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, -1, s0 -; GFX10-NEXT: v_xor_b32_e32 v3, s4, v3 -; GFX10-NEXT: v_xor_b32_e32 v7, s4, v7 +; GFX10-NEXT: v_cndmask_b32_e64 v17, v19, v18, s0 +; GFX10-NEXT: v_add_co_u32 v18, s0, v10, 1 +; GFX10-NEXT: v_add_co_ci_u32_e64 v19, s0, 0, v11, s0 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, s2, v7, v[1:2] +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v17 +; GFX10-NEXT: v_sub_co_u32 v2, s0, v15, s6 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v6, s0, 0, v6, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v8, v10, v18, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v10, v11, v19, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v11, v15, v2, vcc_lo +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s1, s3, v5, v[1:2] +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v14 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v3, v8, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v4, v10, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v16, v6, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v8, vcc_lo, s18, v0 +; GFX10-NEXT: v_sub_co_ci_u32_e64 v10, s1, s19, v1, vcc_lo +; GFX10-NEXT: v_sub_nc_u32_e32 v1, s19, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v12, v11, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v13, v4, s0 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s3, v10 +; GFX10-NEXT: v_xor_b32_e32 v0, s16, v2 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v11, vcc_lo, s3, v1, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v8 +; GFX10-NEXT: v_xor_b32_e32 v2, s17, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, -1, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v13, vcc_lo, v2, s2 +; GFX10-NEXT: v_sub_co_u32 v13, vcc_lo, v8, s2 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v14, s0, 0, v11, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v0, s0, v1, s16 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v1, s0, s17, v4, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, v8 +; GFX10-NEXT: v_sub_co_u32 v0, s0, v0, s16 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v1, s0, s17, v2, s0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, v10 +; GFX10-NEXT: v_xor_b32_e32 v2, s4, v6 ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v11, vcc_lo, s3, v11, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v4, v9, v12, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v12, s0 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s3, v14 -; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, -1, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, -1, s0 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s2, v13 ; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, -1, s0 -; GFX10-NEXT: v_add_co_u32 v15, s0, v6, 1 -; GFX10-NEXT: v_add_co_ci_u32_e64 v16, s0, 0, v5, s0 +; GFX10-NEXT: v_add_co_u32 v15, s0, v5, 1 +; GFX10-NEXT: v_add_co_ci_u32_e64 v16, s0, 0, v7, s0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, v14 -; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v12, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v12, s0 ; GFX10-NEXT: v_add_co_u32 v12, s0, v15, 1 ; GFX10-NEXT: v_add_co_ci_u32_e64 v17, s0, 0, v16, s0 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v9 -; GFX10-NEXT: v_sub_co_u32 v9, s0, v13, s2 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX10-NEXT: v_sub_co_u32 v6, s0, v13, s2 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v11, s0, 0, v11, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v12, v15, v12, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v4 +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v4, v13, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v9, v14, v11, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v12, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v11, v5, v15, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v4, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v9, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v13, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v6, v14, v11, vcc_lo +; GFX10-NEXT: v_xor_b32_e32 v11, s4, v4 +; GFX10-NEXT: v_cndmask_b32_e64 v12, v5, v12, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v15, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v8, v3, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v10, v6, s0 ; GFX10-NEXT: s_xor_b64 s[0:1], s[8:9], s[10:11] -; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v3, s4 -; GFX10-NEXT: v_xor_b32_e32 v3, s0, v6 -; GFX10-NEXT: v_xor_b32_e32 v6, s1, v11 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v5, vcc_lo, s4, v7, vcc_lo -; GFX10-NEXT: v_xor_b32_e32 v7, s8, v2 -; GFX10-NEXT: v_xor_b32_e32 v8, s8, v8 -; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v3, s0 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v6, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v6, vcc_lo, v7, s8 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, s8, v8, vcc_lo -; GFX10-NEXT: global_store_dwordx4 v10, v[0:3], s[12:13] -; GFX10-NEXT: global_store_dwordx4 v10, v[4:7], s[14:15] +; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v2, s4 +; GFX10-NEXT: v_xor_b32_e32 v2, s0, v12 +; GFX10-NEXT: v_xor_b32_e32 v7, s1, v7 +; GFX10-NEXT: v_xor_b32_e32 v8, s8, v3 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v5, vcc_lo, s4, v11, vcc_lo +; GFX10-NEXT: v_xor_b32_e32 v10, s8, v6 +; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v2, s0 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v7, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v6, vcc_lo, v8, s8 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, s8, v10, vcc_lo +; GFX10-NEXT: global_store_dwordx4 v9, v[0:3], s[12:13] +; GFX10-NEXT: global_store_dwordx4 v9, v[4:7], s[14:15] ; GFX10-NEXT: s_endpgm %div = sdiv <2 x i64> %x, %y store <2 x i64> %div, ptr addrspace(1) %out0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll index 6f42239cd191d..39cf7b01fd6c0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll @@ -31,28 +31,28 @@ define i64 @v_srem_i64(i64 %num, i64 %den) { ; CHECK-NEXT: v_xor_b32_e32 v1, v2, v1 ; CHECK-NEXT: v_cvt_f32_u32_e32 v2, v0 ; CHECK-NEXT: v_cvt_f32_u32_e32 v3, v1 -; CHECK-NEXT: v_sub_i32_e32 v11, vcc, 0, v0 -; CHECK-NEXT: v_subb_u32_e32 v12, vcc, 0, v1, vcc +; CHECK-NEXT: v_sub_i32_e32 v12, vcc, 0, v0 +; CHECK-NEXT: v_subb_u32_e32 v13, vcc, 0, v1, vcc ; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 -; CHECK-NEXT: v_trunc_f32_e32 v6, v3 -; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v6 -; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v13, v6 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v10, 0 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[3:4] -; CHECK-NEXT: v_mul_lo_u32 v3, v13, v2 -; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[6:7] -; CHECK-NEXT: v_mul_hi_u32 v6, v10, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v13, v2 -; CHECK-NEXT: v_mul_lo_u32 v7, v10, v8 -; CHECK-NEXT: v_mul_lo_u32 v9, v13, v8 +; CHECK-NEXT: v_trunc_f32_e32 v3, v3 +; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 +; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v3 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v11, 0 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v10, v[3:4] +; CHECK-NEXT: v_mul_lo_u32 v3, v10, v2 +; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v11, v[6:7] +; CHECK-NEXT: v_mul_hi_u32 v6, v11, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 +; CHECK-NEXT: v_mul_lo_u32 v7, v11, v8 +; CHECK-NEXT: v_mul_lo_u32 v9, v10, v8 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CHECK-NEXT: v_mul_hi_u32 v6, v10, v8 +; CHECK-NEXT: v_mul_hi_u32 v6, v11, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v9, v2 @@ -60,53 +60,53 @@ define i64 @v_srem_i64(i64 %num, i64 %den) { ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CHECK-NEXT: v_mul_hi_u32 v7, v13, v8 +; CHECK-NEXT: v_mul_hi_u32 v7, v10, v8 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v2 -; CHECK-NEXT: v_addc_u32_e32 v13, vcc, v13, v3, vcc -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v10, 0 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[3:4] -; CHECK-NEXT: v_ashrrev_i32_e32 v11, 31, v5 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v11 -; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[6:7] -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v11, vcc -; CHECK-NEXT: v_xor_b32_e32 v9, v3, v11 -; CHECK-NEXT: v_mul_lo_u32 v3, v13, v2 -; CHECK-NEXT: v_mul_lo_u32 v5, v10, v8 -; CHECK-NEXT: v_xor_b32_e32 v12, v4, v11 -; CHECK-NEXT: v_mul_hi_u32 v4, v10, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v13, v2 +; CHECK-NEXT: v_add_i32_e32 v11, vcc, v11, v2 +; CHECK-NEXT: v_addc_u32_e32 v10, vcc, v10, v3, vcc +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v11, 0 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v10, v[3:4] +; CHECK-NEXT: v_ashrrev_i32_e32 v12, 31, v5 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v12 +; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v11, v[6:7] +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v12, vcc +; CHECK-NEXT: v_xor_b32_e32 v9, v3, v12 +; CHECK-NEXT: v_mul_lo_u32 v3, v10, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, v11, v8 +; CHECK-NEXT: v_xor_b32_e32 v13, v4, v12 +; CHECK-NEXT: v_mul_hi_u32 v4, v11, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v4, v13, v8 +; CHECK-NEXT: v_mul_lo_u32 v4, v10, v8 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; CHECK-NEXT: v_mul_hi_u32 v5, v10, v8 +; CHECK-NEXT: v_mul_hi_u32 v5, v11, v8 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CHECK-NEXT: v_mul_hi_u32 v5, v13, v8 +; CHECK-NEXT: v_mul_hi_u32 v5, v10, v8 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2 -; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v13, v3, vcc -; CHECK-NEXT: v_mul_lo_u32 v4, v12, v2 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v11, v2 +; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v10, v3, vcc +; CHECK-NEXT: v_mul_lo_u32 v4, v13, v2 ; CHECK-NEXT: v_mul_lo_u32 v5, v9, v3 ; CHECK-NEXT: v_mul_hi_u32 v6, v9, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v12, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v13, v2 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v6, v12, v3 +; CHECK-NEXT: v_mul_lo_u32 v6, v13, v3 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; CHECK-NEXT: v_mul_hi_u32 v5, v9, v3 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2 @@ -115,16 +115,16 @@ define i64 @v_srem_i64(i64 %num, i64 %den) { ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; CHECK-NEXT: v_add_i32_e32 v8, vcc, v2, v4 -; CHECK-NEXT: v_mul_hi_u32 v6, v12, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v2 +; CHECK-NEXT: v_mul_hi_u32 v5, v13, v3 ; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v8, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v4 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v5, v4 ; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v6, v[3:4] ; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v9, v2 ; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v8, v[4:5] -; CHECK-NEXT: v_subb_u32_e64 v3, s[4:5], v12, v6, vcc -; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v12, v6 +; CHECK-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v6, vcc +; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v13, v6 ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v0 @@ -149,10 +149,10 @@ define i64 @v_srem_i64(i64 %num, i64 %den) { ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v11 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v11 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v11 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v11, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v12 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v12 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v12 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v12, vcc ; CHECK-NEXT: ; implicit-def: $vgpr2 ; CHECK-NEXT: ; implicit-def: $vgpr4 ; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] @@ -212,10 +212,10 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-NEXT: s_subb_u32 s5, 0, s9 ; CHECK-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; CHECK-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; CHECK-NEXT: v_trunc_f32_e32 v2, v1 -; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2 +; CHECK-NEXT: v_trunc_f32_e32 v1, v1 +; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; CHECK-NEXT: v_cvt_u32_f32_e32 v6, v0 -; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v1 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v6, 0 ; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v7, v[1:2] ; CHECK-NEXT: v_mul_lo_u32 v1, v7, v0 @@ -286,11 +286,11 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v0, v2 -; CHECK-NEXT: v_mul_hi_u32 v4, s11, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v0 +; CHECK-NEXT: v_mul_hi_u32 v3, s11, v1 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v6, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v2 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v3, v2 ; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v4, v[1:2] ; CHECK-NEXT: v_mov_b32_e32 v1, s9 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, s10, v0 @@ -372,212 +372,209 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_xor_b32_e32 v8, v9, v8 ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v5 ; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v8 -; GISEL-NEXT: v_sub_i32_e32 v15, vcc, 0, v5 -; GISEL-NEXT: v_subb_u32_e32 v16, vcc, 0, v8, vcc +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v5 +; GISEL-NEXT: v_subb_u32_e32 v17, vcc, 0, v8, vcc ; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v9 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v11, v9 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v11 -; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v11 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v14, 0 -; GISEL-NEXT: v_mov_b32_e32 v4, v10 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v17, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v4, v17, v9 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[10:11] -; GISEL-NEXT: v_mul_hi_u32 v11, v14, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v17, v9 -; GISEL-NEXT: v_mul_lo_u32 v10, v14, v12 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10 +; GISEL-NEXT: v_trunc_f32_e32 v9, v9 +; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v9 +; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v9 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v4, 0 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v15, v[10:11] +; GISEL-NEXT: v_mul_lo_u32 v10, v15, v9 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v4, v[11:12] +; GISEL-NEXT: v_mul_hi_u32 v12, v4, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v15, v9 +; GISEL-NEXT: v_mul_lo_u32 v11, v4, v13 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v17, v12 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v10, v4 -; GISEL-NEXT: v_mul_hi_u32 v10, v14, v12 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; GISEL-NEXT: v_mul_lo_u32 v12, v15, v13 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_hi_u32 v11, v4, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GISEL-NEXT: v_mul_hi_u32 v12, v15, v13 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_mul_hi_u32 v11, v17, v12 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v4 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v17, v9, vcc -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v14, 0 -; GISEL-NEXT: v_mov_b32_e32 v4, v10 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v17, v[4:5] +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v4, v9 +; GISEL-NEXT: v_addc_u32_e32 v15, vcc, v15, v10, vcc +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v18, 0 ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[10:11] +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v15, v[10:11] ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; GISEL-NEXT: v_xor_b32_e32 v13, v0, v4 -; GISEL-NEXT: v_mul_lo_u32 v0, v17, v9 -; GISEL-NEXT: v_mul_lo_u32 v10, v14, v12 -; GISEL-NEXT: v_xor_b32_e32 v15, v1, v4 -; GISEL-NEXT: v_mul_hi_u32 v1, v14, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v17, v9 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v18, v[11:12] +; GISEL-NEXT: v_xor_b32_e32 v14, v0, v4 +; GISEL-NEXT: v_mul_lo_u32 v0, v15, v9 +; GISEL-NEXT: v_mul_lo_u32 v10, v18, v13 +; GISEL-NEXT: v_xor_b32_e32 v16, v1, v4 +; GISEL-NEXT: v_mul_hi_u32 v1, v18, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v15, v9 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v17, v12 +; GISEL-NEXT: v_mul_lo_u32 v1, v15, v13 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 -; GISEL-NEXT: v_mul_hi_u32 v10, v14, v12 +; GISEL-NEXT: v_mul_hi_u32 v10, v18, v13 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v17, v12 +; GISEL-NEXT: v_mul_hi_u32 v10, v15, v13 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, v15, v0 -; GISEL-NEXT: v_mul_lo_u32 v10, v13, v1 -; GISEL-NEXT: v_mul_hi_u32 v11, v13, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v18, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v15, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v9, v16, v0 +; GISEL-NEXT: v_mul_lo_u32 v10, v14, v1 +; GISEL-NEXT: v_mul_hi_u32 v11, v14, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v15, v1 +; GISEL-NEXT: v_mul_lo_u32 v11, v16, v1 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GISEL-NEXT: v_mul_hi_u32 v10, v13, v1 +; GISEL-NEXT: v_mul_hi_u32 v10, v14, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_mul_hi_u32 v1, v15, v1 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v0, v9 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v1, v0 -; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v7 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v9, vcc -; GISEL-NEXT: v_xor_b32_e32 v6, v6, v9 -; GISEL-NEXT: v_xor_b32_e32 v7, v7, v9 -; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v16, v7 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v14, 0 -; GISEL-NEXT: v_mac_f32_e32 v12, 0x4f800000, v16 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v0 +; GISEL-NEXT: v_mul_hi_u32 v10, v16, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v13, 0 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v10, v9 ; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v11, v[1:2] -; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v12 -; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v6 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v8, v14, v[9:10] -; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 -; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v1 -; GISEL-NEXT: v_trunc_f32_e32 v12, v9 -; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v12 -; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v1 -; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v12 -; GISEL-NEXT: v_subb_u32_e32 v17, vcc, 0, v7, vcc -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v14, 0 -; GISEL-NEXT: v_sub_i32_e32 v19, vcc, v13, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, v10 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v18, v[0:1] -; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], v15, v11, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v14, v[12:13] -; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v15, v11 -; GISEL-NEXT: v_mul_lo_u32 v11, v18, v9 -; GISEL-NEXT: v_mul_lo_u32 v12, v14, v0 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v10, v8 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v14, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_hi_u32 v9, v18, v9 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v19, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v10, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[6:7] -; GISEL-NEXT: v_sub_i32_e32 v12, vcc, v19, v5 -; GISEL-NEXT: v_subbrev_u32_e64 v15, s[6:7], 0, v1, vcc -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v12, v5 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v15, v8 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, -1, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v15, v8 -; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v12, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[6:7] -; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v12, v5, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v8, v15, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v12, v18, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_mul_hi_u32 v13, v14, v0 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 -; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v8, v13, v[9:10] +; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v14, v0 +; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], v16, v11, vcc +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v16, v11 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v5 +; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v0, v8, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v12, v8 +; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v10, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v11, v1, v9, s[4:5] +; GISEL-NEXT: v_subbrev_u32_e64 v14, s[4:5], 0, v0, vcc +; GISEL-NEXT: v_ashrrev_i32_e32 v1, 31, v7 +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v1 +; GISEL-NEXT: v_addc_u32_e64 v7, s[4:5], v7, v1, s[4:5] +; GISEL-NEXT: v_xor_b32_e32 v6, v6, v1 +; GISEL-NEXT: v_xor_b32_e32 v7, v7, v1 +; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v7 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v5 +; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v9 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v14, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v15, v15, v16, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v0, v8, vcc +; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1 +; GISEL-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GISEL-NEXT: v_trunc_f32_e32 v1, v1 +; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v0 +; GISEL-NEXT: v_sub_i32_e32 v19, vcc, 0, v6 +; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v17, 0 +; GISEL-NEXT: v_subb_u32_e32 v20, vcc, 0, v7, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v19, v18, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v21, v18, v0 +; GISEL-NEXT: v_mul_hi_u32 v22, v17, v0 +; GISEL-NEXT: v_mul_hi_u32 v23, v18, v0 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v13, v5 +; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v16, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 +; GISEL-NEXT: v_cndmask_b32_e32 v13, v13, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v17, v[8:9] +; GISEL-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc +; GISEL-NEXT: v_mul_lo_u32 v1, v17, v0 +; GISEL-NEXT: v_mul_lo_u32 v9, v18, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v21, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v22 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1 +; GISEL-NEXT: v_mul_hi_u32 v8, v17, v0 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v23 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v1 -; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v18, v0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v12, 0 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 +; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v17, v1 +; GISEL-NEXT: v_addc_u32_e32 v15, vcc, v18, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v14, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GISEL-NEXT: v_cndmask_b32_e32 v14, v10, v8, vcc -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v16, v13, v[1:2] -; GISEL-NEXT: v_cndmask_b32_e32 v5, v19, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v1, v5, v4 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v17, v12, v[8:9] -; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v3 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v11, v2, v5 -; GISEL-NEXT: v_mul_lo_u32 v2, v13, v0 -; GISEL-NEXT: v_mul_lo_u32 v8, v12, v10 -; GISEL-NEXT: v_xor_b32_e32 v15, v3, v5 -; GISEL-NEXT: v_mul_hi_u32 v3, v12, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 +; GISEL-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v19, v15, v[1:2] +; GISEL-NEXT: v_xor_b32_e32 v1, v10, v4 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v12, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v20, v14, v[8:9] +; GISEL-NEXT: v_ashrrev_i32_e32 v12, 31, v3 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v12 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc +; GISEL-NEXT: v_xor_b32_e32 v11, v2, v12 +; GISEL-NEXT: v_mul_lo_u32 v2, v15, v0 +; GISEL-NEXT: v_mul_lo_u32 v8, v14, v10 +; GISEL-NEXT: v_xor_b32_e32 v13, v3, v12 +; GISEL-NEXT: v_mul_hi_u32 v3, v14, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v13, v10 +; GISEL-NEXT: v_mul_lo_u32 v3, v15, v10 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v8, v2 -; GISEL-NEXT: v_mul_hi_u32 v8, v12, v10 +; GISEL-NEXT: v_mul_hi_u32 v8, v14, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v13, v10 +; GISEL-NEXT: v_mul_hi_u32 v8, v15, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v8, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v13, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v15, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0 ; GISEL-NEXT: v_mul_lo_u32 v8, v11, v2 ; GISEL-NEXT: v_mul_hi_u32 v9, v11, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 -; GISEL-NEXT: v_xor_b32_e32 v10, v14, v4 +; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 +; GISEL-NEXT: v_xor_b32_e32 v5, v5, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, v15, v2 +; GISEL-NEXT: v_mul_lo_u32 v9, v13, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v8, v3 ; GISEL-NEXT: v_mul_hi_u32 v8, v11, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 @@ -585,48 +582,47 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v9, v15, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v12, 0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v9, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v13, v[0:1] +; GISEL-NEXT: v_mul_hi_u32 v8, v13, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v10, 0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v0, v[3:4] ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v10, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v12, v[8:9] +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v5, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v10, v[8:9] ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v11, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v15, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v15, v3 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v13, v3, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v13, v3 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v7 ; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[4:5] -; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v2, v6 -; GISEL-NEXT: v_subbrev_u32_e64 v10, s[4:5], 0, v3, vcc -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[4:5] +; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v2, v6 +; GISEL-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v3, vcc +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v6 ; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v7 -; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v9, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v7 +; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v8, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5] ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GISEL-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GISEL-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v5 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v5 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v5, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v12 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v12 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_srem_v2i64: @@ -651,28 +647,28 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_xor_b32_e32 v1, v2, v1 ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v0 ; CGP-NEXT: v_cvt_f32_u32_e32 v3, v1 -; CGP-NEXT: v_sub_i32_e32 v15, vcc, 0, v0 -; CGP-NEXT: v_subb_u32_e32 v16, vcc, 0, v1, vcc +; CGP-NEXT: v_sub_i32_e32 v16, vcc, 0, v0 +; CGP-NEXT: v_subb_u32_e32 v17, vcc, 0, v1, vcc ; CGP-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CGP-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CGP-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 -; CGP-NEXT: v_trunc_f32_e32 v4, v3 -; CGP-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v14, v2 -; CGP-NEXT: v_cvt_u32_f32_e32 v17, v4 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v15, v14, 0 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v15, v17, v[3:4] -; CGP-NEXT: v_mul_lo_u32 v3, v17, v2 -; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[4:5] -; CGP-NEXT: v_mul_hi_u32 v4, v14, v2 -; CGP-NEXT: v_mul_hi_u32 v2, v17, v2 -; CGP-NEXT: v_mul_lo_u32 v5, v14, v12 -; CGP-NEXT: v_mul_lo_u32 v13, v17, v12 +; CGP-NEXT: v_trunc_f32_e32 v3, v3 +; CGP-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 +; CGP-NEXT: v_cvt_u32_f32_e32 v15, v2 +; CGP-NEXT: v_cvt_u32_f32_e32 v14, v3 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v16, v15, 0 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v16, v14, v[3:4] +; CGP-NEXT: v_mul_lo_u32 v3, v14, v2 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v15, v[4:5] +; CGP-NEXT: v_mul_hi_u32 v4, v15, v2 +; CGP-NEXT: v_mul_hi_u32 v2, v14, v2 +; CGP-NEXT: v_mul_lo_u32 v5, v15, v12 +; CGP-NEXT: v_mul_lo_u32 v13, v14, v12 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v14, v12 +; CGP-NEXT: v_mul_hi_u32 v4, v15, v12 ; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v13, v2 @@ -680,53 +676,53 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CGP-NEXT: v_mul_hi_u32 v5, v17, v12 +; CGP-NEXT: v_mul_hi_u32 v5, v14, v12 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v2 -; CGP-NEXT: v_addc_u32_e32 v17, vcc, v17, v3, vcc -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v15, v14, 0 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v15, v17, v[3:4] -; CGP-NEXT: v_ashrrev_i32_e32 v15, 31, v11 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v15 -; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[4:5] -; CGP-NEXT: v_addc_u32_e32 v4, vcc, v11, v15, vcc -; CGP-NEXT: v_xor_b32_e32 v13, v3, v15 -; CGP-NEXT: v_mul_lo_u32 v3, v17, v2 -; CGP-NEXT: v_mul_lo_u32 v5, v14, v12 -; CGP-NEXT: v_xor_b32_e32 v16, v4, v15 -; CGP-NEXT: v_mul_hi_u32 v4, v14, v2 -; CGP-NEXT: v_mul_hi_u32 v2, v17, v2 +; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v2 +; CGP-NEXT: v_addc_u32_e32 v14, vcc, v14, v3, vcc +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v16, v15, 0 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v16, v14, v[3:4] +; CGP-NEXT: v_ashrrev_i32_e32 v16, 31, v11 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v16 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v15, v[4:5] +; CGP-NEXT: v_addc_u32_e32 v4, vcc, v11, v16, vcc +; CGP-NEXT: v_xor_b32_e32 v13, v3, v16 +; CGP-NEXT: v_mul_lo_u32 v3, v14, v2 +; CGP-NEXT: v_mul_lo_u32 v5, v15, v12 +; CGP-NEXT: v_xor_b32_e32 v17, v4, v16 +; CGP-NEXT: v_mul_hi_u32 v4, v15, v2 +; CGP-NEXT: v_mul_hi_u32 v2, v14, v2 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v4, v17, v12 +; CGP-NEXT: v_mul_lo_u32 v4, v14, v12 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; CGP-NEXT: v_mul_hi_u32 v5, v14, v12 +; CGP-NEXT: v_mul_hi_u32 v5, v15, v12 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CGP-NEXT: v_mul_hi_u32 v5, v17, v12 +; CGP-NEXT: v_mul_hi_u32 v5, v14, v12 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v14, v2 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v17, v3, vcc -; CGP-NEXT: v_mul_lo_u32 v4, v16, v2 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v15, v2 +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v14, v3, vcc +; CGP-NEXT: v_mul_lo_u32 v4, v17, v2 ; CGP-NEXT: v_mul_lo_u32 v5, v13, v3 ; CGP-NEXT: v_mul_hi_u32 v10, v13, v2 -; CGP-NEXT: v_mul_hi_u32 v2, v16, v2 +; CGP-NEXT: v_mul_hi_u32 v2, v17, v2 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v10, v16, v3 +; CGP-NEXT: v_mul_lo_u32 v10, v17, v3 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; CGP-NEXT: v_mul_hi_u32 v5, v13, v3 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v10, v2 @@ -735,16 +731,16 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v2, v4 -; CGP-NEXT: v_mul_hi_u32 v10, v16, v3 +; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v2 +; CGP-NEXT: v_mul_hi_u32 v5, v17, v3 ; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v12, 0 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v4 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v5, v4 ; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v10, v[3:4] ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v13, v2 ; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v1, v12, v[4:5] -; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v16, v10, vcc -; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v16, v10 +; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v17, v10, vcc +; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v17, v10 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v1 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v0 @@ -769,10 +765,10 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CGP-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; CGP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; CGP-NEXT: v_xor_b32_e32 v0, v0, v15 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v15 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v15 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v15, vcc +; CGP-NEXT: v_xor_b32_e32 v0, v0, v16 +; CGP-NEXT: v_xor_b32_e32 v1, v1, v16 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v16 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v16, vcc ; CGP-NEXT: ; implicit-def: $vgpr4 ; CGP-NEXT: ; implicit-def: $vgpr10 ; CGP-NEXT: .LBB2_2: ; %Flow1 @@ -820,28 +816,28 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_xor_b32_e32 v3, v4, v3 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v5, v3 -; CGP-NEXT: v_sub_i32_e32 v13, vcc, 0, v2 -; CGP-NEXT: v_subb_u32_e32 v14, vcc, 0, v3, vcc +; CGP-NEXT: v_sub_i32_e32 v14, vcc, 0, v2 +; CGP-NEXT: v_subb_u32_e32 v15, vcc, 0, v3, vcc ; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; CGP-NEXT: v_trunc_f32_e32 v6, v5 -; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6 -; CGP-NEXT: v_cvt_u32_f32_e32 v12, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v15, v6 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v12, 0 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v15, v[5:6] -; CGP-NEXT: v_mul_lo_u32 v5, v15, v4 -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[6:7] -; CGP-NEXT: v_mul_hi_u32 v6, v12, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v15, v4 -; CGP-NEXT: v_mul_lo_u32 v7, v12, v10 -; CGP-NEXT: v_mul_lo_u32 v11, v15, v10 +; CGP-NEXT: v_trunc_f32_e32 v5, v5 +; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 +; CGP-NEXT: v_cvt_u32_f32_e32 v13, v4 +; CGP-NEXT: v_cvt_u32_f32_e32 v12, v5 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v13, 0 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v12, v[5:6] +; CGP-NEXT: v_mul_lo_u32 v5, v12, v4 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v13, v[6:7] +; CGP-NEXT: v_mul_hi_u32 v6, v13, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v12, v4 +; CGP-NEXT: v_mul_lo_u32 v7, v13, v10 +; CGP-NEXT: v_mul_lo_u32 v11, v12, v10 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CGP-NEXT: v_mul_hi_u32 v6, v12, v10 +; CGP-NEXT: v_mul_hi_u32 v6, v13, v10 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 @@ -849,53 +845,53 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_mul_hi_u32 v7, v15, v10 +; CGP-NEXT: v_mul_hi_u32 v7, v12, v10 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v4 -; CGP-NEXT: v_addc_u32_e32 v15, vcc, v15, v5, vcc -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v12, 0 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v15, v[5:6] -; CGP-NEXT: v_ashrrev_i32_e32 v13, 31, v9 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v13 -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[6:7] -; CGP-NEXT: v_addc_u32_e32 v6, vcc, v9, v13, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v5, v13 -; CGP-NEXT: v_mul_lo_u32 v5, v15, v4 -; CGP-NEXT: v_mul_lo_u32 v7, v12, v10 -; CGP-NEXT: v_xor_b32_e32 v14, v6, v13 -; CGP-NEXT: v_mul_hi_u32 v6, v12, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v15, v4 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v4 +; CGP-NEXT: v_addc_u32_e32 v12, vcc, v12, v5, vcc +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v13, 0 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v12, v[5:6] +; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v9 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v14 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v13, v[6:7] +; CGP-NEXT: v_addc_u32_e32 v6, vcc, v9, v14, vcc +; CGP-NEXT: v_xor_b32_e32 v11, v5, v14 +; CGP-NEXT: v_mul_lo_u32 v5, v12, v4 +; CGP-NEXT: v_mul_lo_u32 v7, v13, v10 +; CGP-NEXT: v_xor_b32_e32 v15, v6, v14 +; CGP-NEXT: v_mul_hi_u32 v6, v13, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v12, v4 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v6, v15, v10 +; CGP-NEXT: v_mul_lo_u32 v6, v12, v10 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CGP-NEXT: v_mul_hi_u32 v7, v12, v10 +; CGP-NEXT: v_mul_hi_u32 v7, v13, v10 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; CGP-NEXT: v_mul_hi_u32 v7, v15, v10 +; CGP-NEXT: v_mul_hi_u32 v7, v12, v10 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v12, v4 -; CGP-NEXT: v_addc_u32_e32 v5, vcc, v15, v5, vcc -; CGP-NEXT: v_mul_lo_u32 v6, v14, v4 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 +; CGP-NEXT: v_addc_u32_e32 v5, vcc, v12, v5, vcc +; CGP-NEXT: v_mul_lo_u32 v6, v15, v4 ; CGP-NEXT: v_mul_lo_u32 v7, v11, v5 ; CGP-NEXT: v_mul_hi_u32 v8, v11, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v14, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v15, v4 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v8, v14, v5 +; CGP-NEXT: v_mul_lo_u32 v8, v15, v5 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; CGP-NEXT: v_mul_hi_u32 v7, v11, v5 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 @@ -904,16 +900,16 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v4, v6 -; CGP-NEXT: v_mul_hi_u32 v8, v14, v5 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v4 +; CGP-NEXT: v_mul_hi_u32 v7, v15, v5 ; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v10, 0 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v6 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v7, v6 ; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v2, v8, v[5:6] ; CGP-NEXT: v_sub_i32_e32 v4, vcc, v11, v4 ; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v10, v[6:7] -; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v14, v8, vcc -; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v14, v8 +; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v15, v8, vcc +; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v15, v8 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v3 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v2 @@ -938,10 +934,10 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 ; CGP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; CGP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v13 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v13 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v13 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v13, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v2, v14 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v14 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v14 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v14, vcc ; CGP-NEXT: ; implicit-def: $vgpr6 ; CGP-NEXT: ; implicit-def: $vgpr8 ; CGP-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] @@ -982,10 +978,10 @@ define i64 @v_srem_i64_pow2k_denom(i64 %num) { ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 -; CHECK-NEXT: v_trunc_f32_e32 v4, v3 -; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 +; CHECK-NEXT: v_trunc_f32_e32 v3, v3 +; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 ; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v4 +; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v3 ; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 ; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4] ; CHECK-NEXT: v_mul_lo_u32 v3, v10, v2 @@ -1061,11 +1057,11 @@ define i64 @v_srem_i64_pow2k_denom(i64 %num) { ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v0, v2 -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CHECK-NEXT: v_mul_hi_u32 v7, v5, v1 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v0 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v7 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v2 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v3 ; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v7, v[1:2] ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0 ; CHECK-NEXT: v_subb_u32_e64 v1, s[4:5], v5, v2, vcc @@ -1112,153 +1108,150 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: s_subb_u32 s6, 0, 0 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v7, v5 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 -; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v7 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v8, 0 -; GISEL-NEXT: v_mov_b32_e32 v7, v5 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[7:8] -; GISEL-NEXT: v_mul_hi_u32 v12, v9, v4 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v8, v[10:11] -; GISEL-NEXT: v_mul_lo_u32 v10, v9, v4 -; GISEL-NEXT: v_mul_hi_u32 v11, v8, v4 -; GISEL-NEXT: v_mul_lo_u32 v7, v8, v13 -; GISEL-NEXT: v_mul_lo_u32 v4, v9, v13 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v10, v7 +; GISEL-NEXT: v_trunc_f32_e32 v5, v5 +; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 +; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v5 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[5:6] +; GISEL-NEXT: v_mul_hi_u32 v11, v7, v4 +; GISEL-NEXT: v_mul_hi_u32 v12, v8, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[9:10] +; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4 +; GISEL-NEXT: v_mul_lo_u32 v9, v7, v13 +; GISEL-NEXT: v_mul_lo_u32 v4, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7 -; GISEL-NEXT: v_mul_hi_u32 v14, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 +; GISEL-NEXT: v_mul_hi_u32 v14, v7, v13 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 -; GISEL-NEXT: v_add_i32_e32 v18, vcc, v8, v4 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v18, 0 -; GISEL-NEXT: v_addc_u32_e32 v19, vcc, v9, v7, vcc -; GISEL-NEXT: v_mov_b32_e32 v4, v14 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v19, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v4, v19, v13 -; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], s6, v18, v[14:15] +; GISEL-NEXT: v_mul_hi_u32 v13, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v4, 0 +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc +; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v6, v17, v[14:15] +; GISEL-NEXT: v_mul_lo_u32 v9, v17, v13 +; GISEL-NEXT: v_mul_hi_u32 v18, v4, v13 +; GISEL-NEXT: v_mul_hi_u32 v19, v17, v13 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v4, v[15:16] ; GISEL-NEXT: s_mov_b32 s6, 1 ; GISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GISEL-NEXT: v_mul_lo_u32 v7, v18, v16 +; GISEL-NEXT: v_mul_lo_u32 v14, v4, v13 +; GISEL-NEXT: v_mul_hi_u32 v15, v4, v13 ; GISEL-NEXT: s_subb_u32 s6, 0, 0 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v18, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 v7, v19, v13 -; GISEL-NEXT: v_mul_lo_u32 v13, v19, v16 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v14, v4 -; GISEL-NEXT: v_mul_hi_u32 v14, v18, v16 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v7, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v7 -; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc -; GISEL-NEXT: v_xor_b32_e32 v17, v0, v7 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v19, v16 -; GISEL-NEXT: v_xor_b32_e32 v20, v1, v7 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v9 +; GISEL-NEXT: v_mul_lo_u32 v9, v17, v13 +; GISEL-NEXT: v_mul_hi_u32 v13, v17, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v19 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v9, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v9 +; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc +; GISEL-NEXT: v_xor_b32_e32 v18, v0, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v14 +; GISEL-NEXT: v_xor_b32_e32 v19, v1, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v16, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v18, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v20, v0 -; GISEL-NEXT: v_mul_lo_u32 v14, v17, v1 -; GISEL-NEXT: v_mul_hi_u32 v15, v17, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v20, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v13, v19, v0 +; GISEL-NEXT: v_mul_lo_u32 v14, v18, v1 +; GISEL-NEXT: v_mul_hi_u32 v15, v18, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v19, v0 ; GISEL-NEXT: v_mov_b32_e32 v4, 0x1000 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v15, v20, v1 +; GISEL-NEXT: v_mul_lo_u32 v15, v19, v1 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_mul_hi_u32 v14, v17, v1 +; GISEL-NEXT: v_mul_hi_u32 v14, v18, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_add_i32_e32 v18, vcc, v0, v13 -; GISEL-NEXT: v_mul_hi_u32 v15, v20, v1 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v18, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v13 +; GISEL-NEXT: v_mul_hi_u32 v1, v19, v1 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v0, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v1, v0 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v17, 0 ; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v15, v[1:2] -; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], 0, v18, v[13:14] -; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v17, v0 -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v20, v15 -; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v20, v15, vcc +; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], 0, v17, v[13:14] +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v18, v0 +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v15 +; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v19, v15, vcc ; GISEL-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc ; GISEL-NEXT: v_sub_i32_e32 v18, vcc, v16, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v0, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v16, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v17 -; GISEL-NEXT: v_cndmask_b32_e32 v20, -1, v0, vcc -; GISEL-NEXT: v_mov_b32_e32 v0, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 ; GISEL-NEXT: v_cndmask_b32_e64 v15, -1, v1, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v9, v[0:1] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, v[13:14] -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v18, v4 -; GISEL-NEXT: v_mul_lo_u32 v13, v8, v0 -; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v19, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v20, -1, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[5:6] +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[0:1] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v18, v4 +; GISEL-NEXT: v_mul_lo_u32 v5, v7, v13 +; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v19, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GISEL-NEXT: v_cndmask_b32_e32 v14, v18, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v5, v19, v5, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v13 +; GISEL-NEXT: v_cndmask_b32_e32 v14, v18, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v18, v19, v1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v5, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GISEL-NEXT: v_mul_hi_u32 v1, v7, v13 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; GISEL-NEXT: v_mul_hi_u32 v10, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v9, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_mul_hi_u32 v10, v8, v0 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v8, v1 -; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v9, v0, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v7, v0 +; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v8, v1, vcc ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 -; GISEL-NEXT: v_cndmask_b32_e32 v13, v17, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v11, v[1:2] -; GISEL-NEXT: v_cndmask_b32_e32 v12, v16, v14, vcc -; GISEL-NEXT: v_xor_b32_e32 v1, v12, v7 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, v[8:9] -; GISEL-NEXT: v_ashrrev_i32_e32 v12, 31, v3 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v12 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc -; GISEL-NEXT: v_xor_b32_e32 v9, v2, v12 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v16, v14, vcc +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v11, v[1:2] +; GISEL-NEXT: v_xor_b32_e32 v1, v5, v9 +; GISEL-NEXT: v_ashrrev_i32_e32 v13, 31, v3 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, v[7:8] +; GISEL-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v13 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v13, vcc +; GISEL-NEXT: v_xor_b32_e32 v14, v2, v13 ; GISEL-NEXT: v_mul_lo_u32 v2, v11, v0 ; GISEL-NEXT: v_mul_lo_u32 v6, v10, v5 -; GISEL-NEXT: v_xor_b32_e32 v14, v3, v12 +; GISEL-NEXT: v_xor_b32_e32 v15, v3, v13 ; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 @@ -1280,37 +1273,36 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 ; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0 -; GISEL-NEXT: v_mul_lo_u32 v5, v9, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v9, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 -; GISEL-NEXT: v_xor_b32_e32 v8, v13, v7 +; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0 +; GISEL-NEXT: v_mul_lo_u32 v5, v14, v2 +; GISEL-NEXT: v_mul_hi_u32 v6, v14, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 +; GISEL-NEXT: v_xor_b32_e32 v7, v12, v9 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v6, v14, v2 +; GISEL-NEXT: v_mul_lo_u32 v6, v15, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GISEL-NEXT: v_mul_hi_u32 v5, v9, v2 +; GISEL-NEXT: v_mul_hi_u32 v5, v14, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v6, v14, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v10, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v6, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v11, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v7 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v7, vcc +; GISEL-NEXT: v_mul_hi_u32 v5, v15, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v10, 0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v0, v[3:4] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v9 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v7, v9, vcc ; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], 0, v10, v[5:6] -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v9, v2 -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v14, v7 -; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v14, v7, vcc +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v14, v2 +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v15, v7 +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v15, v7, vcc ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v2, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc @@ -1330,10 +1322,10 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v12 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v12 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v13 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v13 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v13 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v13, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_srem_v2i64_pow2k_denom: @@ -1346,176 +1338,173 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; CGP-NEXT: v_trunc_f32_e32 v8, v5 -; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8 +; CGP-NEXT: v_trunc_f32_e32 v5, v5 +; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 +; CGP-NEXT: v_cvt_u32_f32_e32 v8, v5 ; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 -; CGP-NEXT: v_mov_b32_e32 v9, v5 -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v8, v[9:10] -; CGP-NEXT: v_mul_hi_u32 v12, v8, v4 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v7, v[10:11] -; CGP-NEXT: v_mul_lo_u32 v10, v8, v4 -; CGP-NEXT: v_mul_hi_u32 v11, v7, v4 -; CGP-NEXT: v_mul_lo_u32 v4, v7, v13 -; CGP-NEXT: v_mul_lo_u32 v9, v8, v13 -; CGP-NEXT: v_mul_hi_u32 v14, v7, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v8, v13 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[5:6] +; CGP-NEXT: v_mul_hi_u32 v11, v8, v4 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], -1, v7, v[9:10] +; CGP-NEXT: v_mul_lo_u32 v9, v8, v4 +; CGP-NEXT: v_mul_hi_u32 v10, v7, v4 +; CGP-NEXT: v_mul_lo_u32 v4, v7, v12 +; CGP-NEXT: v_mul_lo_u32 v13, v8, v12 +; CGP-NEXT: v_mul_hi_u32 v14, v7, v12 +; CGP-NEXT: v_mul_hi_u32 v12, v8, v12 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v11 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v14 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v14, v9 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 -; CGP-NEXT: v_add_i32_e32 v18, vcc, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v18, 0 -; CGP-NEXT: v_addc_u32_e32 v19, vcc, v8, v9, vcc -; CGP-NEXT: v_mov_b32_e32 v4, v14 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v19, v[4:5] -; CGP-NEXT: v_mul_lo_u32 v4, v19, v13 -; CGP-NEXT: v_mad_u64_u32 v[16:17], s[4:5], -1, v18, v[14:15] -; CGP-NEXT: v_mul_lo_u32 v9, v18, v16 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v18, v13 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; CGP-NEXT: v_addc_u32_e32 v16, vcc, v8, v12, vcc +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v6, v4, 0 +; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v16, v[13:14] +; CGP-NEXT: v_mul_lo_u32 v17, v16, v12 +; CGP-NEXT: v_mul_hi_u32 v18, v4, v12 +; CGP-NEXT: v_mul_hi_u32 v19, v16, v12 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], -1, v4, v[14:15] +; CGP-NEXT: v_mul_lo_u32 v13, v4, v12 +; CGP-NEXT: v_mul_hi_u32 v15, v4, v12 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v17, v13 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_hi_u32 v9, v19, v13 -; CGP-NEXT: v_mul_lo_u32 v13, v19, v16 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4 -; CGP-NEXT: v_mul_hi_u32 v14, v18, v16 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v18 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v9, v14 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v9 -; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc -; CGP-NEXT: v_xor_b32_e32 v15, v0, v9 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v19, v16 -; CGP-NEXT: v_xor_b32_e32 v17, v1, v9 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; CGP-NEXT: v_mul_lo_u32 v14, v16, v12 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v19 +; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v15 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v15, vcc, v17, v15 +; CGP-NEXT: v_mul_hi_u32 v17, v16, v12 +; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v12 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v12, vcc +; CGP-NEXT: v_xor_b32_e32 v18, v0, v12 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v13 +; CGP-NEXT: v_xor_b32_e32 v19, v1, v12 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v18, v0 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v17, v0 -; CGP-NEXT: v_mul_lo_u32 v14, v15, v1 -; CGP-NEXT: v_mul_hi_u32 v16, v15, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v17, v0 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v15, v1 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v17, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v16, v1, vcc +; CGP-NEXT: v_mul_lo_u32 v13, v19, v0 +; CGP-NEXT: v_mul_lo_u32 v14, v18, v1 +; CGP-NEXT: v_mul_hi_u32 v15, v18, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v19, v0 ; CGP-NEXT: v_mov_b32_e32 v4, 0x1000 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v16, v17, v1 +; CGP-NEXT: v_mul_lo_u32 v15, v19, v1 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_mul_hi_u32 v14, v15, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v14, v18, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v0 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v0, v13 -; CGP-NEXT: v_mul_hi_u32 v18, v17, v1 +; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v15, v19, v1 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v0 ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v13, 0 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v16 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v18, v13 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v16, v[1:2] -; CGP-NEXT: v_sub_i32_e32 v15, vcc, v15, v0 -; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v17, v13 -; CGP-NEXT: v_subb_u32_e64 v16, s[4:5], v17, v13, vcc +; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v14 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v15, v[1:2] +; CGP-NEXT: v_sub_i32_e32 v15, vcc, v18, v0 +; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13 +; CGP-NEXT: v_subb_u32_e64 v16, s[4:5], v19, v13, vcc ; CGP-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc ; CGP-NEXT: v_sub_i32_e32 v18, vcc, v15, v4 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v4 ; CGP-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v0, vcc +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v4 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4 -; CGP-NEXT: v_cndmask_b32_e64 v17, -1, v1, s[4:5] -; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; CGP-NEXT: v_mov_b32_e32 v0, v5 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v8, v[0:1] +; CGP-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 -; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v1, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[13:14] -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v18, v4 -; CGP-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v19, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CGP-NEXT: v_mul_lo_u32 v5, v7, v0 -; CGP-NEXT: v_cndmask_b32_e32 v14, v18, v1, vcc -; CGP-NEXT: v_cndmask_b32_e32 v13, v19, v13, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v5 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11 -; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v10, v8, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1 -; CGP-NEXT: v_mul_hi_u32 v5, v7, v0 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v7, v1 -; CGP-NEXT: v_addc_u32_e32 v11, vcc, v8, v0, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 +; CGP-NEXT: v_cndmask_b32_e64 v17, -1, v1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e32 v20, -1, v0, vcc +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[5:6] +; CGP-NEXT: v_sub_i32_e32 v5, vcc, v18, v4 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v7, v[0:1] +; CGP-NEXT: v_subbrev_u32_e32 v21, vcc, 0, v19, vcc +; CGP-NEXT: v_mul_lo_u32 v1, v7, v13 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; CGP-NEXT: v_cndmask_b32_e32 v0, v18, v5, vcc +; CGP-NEXT: v_cndmask_b32_e32 v5, v19, v21, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 -; CGP-NEXT: v_cndmask_b32_e32 v5, v15, v14, vcc -; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v11, v[1:2] -; CGP-NEXT: v_cndmask_b32_e32 v12, v16, v13, vcc -; CGP-NEXT: v_xor_b32_e32 v13, v5, v9 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v10, v[7:8] -; CGP-NEXT: v_xor_b32_e32 v1, v12, v9 -; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc -; CGP-NEXT: v_xor_b32_e32 v7, v2, v12 -; CGP-NEXT: v_mul_lo_u32 v2, v11, v0 -; CGP-NEXT: v_mul_lo_u32 v6, v10, v5 -; CGP-NEXT: v_xor_b32_e32 v8, v3, v12 -; CGP-NEXT: v_mul_hi_u32 v3, v10, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v11, v0 +; CGP-NEXT: v_cndmask_b32_e32 v14, v15, v0, vcc +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v9, v1 +; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v0, v10 +; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; CGP-NEXT: v_mul_lo_u32 v9, v8, v13 +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v1, v0 +; CGP-NEXT: v_mul_hi_u32 v1, v7, v13 +; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v11 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v9, v1 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v10, v9 +; CGP-NEXT: v_mul_hi_u32 v10, v8, v13 +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v1, v0 +; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v9, v1 +; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v10, v1 +; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v7, v0 +; CGP-NEXT: v_addc_u32_e64 v10, s[4:5], v8, v1, s[4:5] +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, 0 +; CGP-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc +; CGP-NEXT: v_ashrrev_i32_e32 v13, 31, v3 +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v10, v[1:2] +; CGP-NEXT: v_xor_b32_e32 v1, v5, v12 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v13 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v9, v[7:8] +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v13, vcc +; CGP-NEXT: v_xor_b32_e32 v7, v2, v13 +; CGP-NEXT: v_mul_lo_u32 v2, v10, v0 +; CGP-NEXT: v_mul_lo_u32 v6, v9, v5 +; CGP-NEXT: v_xor_b32_e32 v8, v3, v13 +; CGP-NEXT: v_mul_hi_u32 v3, v9, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v10, v0 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v3, v11, v5 +; CGP-NEXT: v_mul_lo_u32 v3, v10, v5 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; CGP-NEXT: v_mul_hi_u32 v6, v10, v5 +; CGP-NEXT: v_mul_hi_u32 v6, v9, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CGP-NEXT: v_mul_hi_u32 v5, v11, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v10, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v0 -; CGP-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v0 +; CGP-NEXT: v_addc_u32_e32 v2, vcc, v10, v2, vcc ; CGP-NEXT: v_mul_lo_u32 v5, v8, v3 ; CGP-NEXT: v_mul_lo_u32 v6, v7, v2 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v13, v9 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc +; CGP-NEXT: v_xor_b32_e32 v11, v14, v12 ; CGP-NEXT: v_mul_hi_u32 v9, v7, v3 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v12 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v12, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 @@ -1530,11 +1519,11 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v3, v5 -; CGP-NEXT: v_mul_hi_u32 v10, v8, v2 +; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v9, v8, v2 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v3 ; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v5, 0 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v9 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v5 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v6 ; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v9, v[3:4] ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v7, v2 ; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v8, v5, vcc @@ -1558,10 +1547,10 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v12 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v12 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v2, v13 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v13 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v13 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v13, vcc ; CGP-NEXT: s_setpc_b64 s[30:31] %result = srem <2 x i64> %num, ret <2 x i64> %result @@ -1578,10 +1567,10 @@ define i64 @v_srem_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 -; CHECK-NEXT: v_trunc_f32_e32 v4, v3 -; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 +; CHECK-NEXT: v_trunc_f32_e32 v3, v3 +; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 ; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v4 +; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v3 ; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 ; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4] ; CHECK-NEXT: v_mul_lo_u32 v3, v10, v2 @@ -1657,11 +1646,11 @@ define i64 @v_srem_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v0, v2 -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CHECK-NEXT: v_mul_hi_u32 v7, v5, v1 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v0 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v7 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v2 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v3 ; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v7, v[1:2] ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0 ; CHECK-NEXT: v_subb_u32_e64 v1, s[4:5], v5, v2, vcc @@ -1708,153 +1697,150 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: s_subb_u32 s6, 0, 0 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v7, v5 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 -; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v7 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v8, 0 -; GISEL-NEXT: v_mov_b32_e32 v7, v5 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[7:8] -; GISEL-NEXT: v_mul_hi_u32 v12, v9, v4 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v8, v[10:11] -; GISEL-NEXT: v_mul_lo_u32 v10, v9, v4 -; GISEL-NEXT: v_mul_hi_u32 v11, v8, v4 -; GISEL-NEXT: v_mul_lo_u32 v7, v8, v13 -; GISEL-NEXT: v_mul_lo_u32 v4, v9, v13 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v10, v7 +; GISEL-NEXT: v_trunc_f32_e32 v5, v5 +; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 +; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v5 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[5:6] +; GISEL-NEXT: v_mul_hi_u32 v11, v7, v4 +; GISEL-NEXT: v_mul_hi_u32 v12, v8, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[9:10] +; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4 +; GISEL-NEXT: v_mul_lo_u32 v9, v7, v13 +; GISEL-NEXT: v_mul_lo_u32 v4, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7 -; GISEL-NEXT: v_mul_hi_u32 v14, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 +; GISEL-NEXT: v_mul_hi_u32 v14, v7, v13 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 -; GISEL-NEXT: v_add_i32_e32 v18, vcc, v8, v4 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v18, 0 -; GISEL-NEXT: v_addc_u32_e32 v19, vcc, v9, v7, vcc -; GISEL-NEXT: v_mov_b32_e32 v4, v14 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v19, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v4, v19, v13 -; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], s6, v18, v[14:15] +; GISEL-NEXT: v_mul_hi_u32 v13, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v4, 0 +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc +; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v6, v17, v[14:15] +; GISEL-NEXT: v_mul_lo_u32 v9, v17, v13 +; GISEL-NEXT: v_mul_hi_u32 v18, v4, v13 +; GISEL-NEXT: v_mul_hi_u32 v19, v17, v13 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v4, v[15:16] ; GISEL-NEXT: s_mov_b32 s6, 1 ; GISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GISEL-NEXT: v_mul_lo_u32 v7, v18, v16 +; GISEL-NEXT: v_mul_lo_u32 v14, v4, v13 +; GISEL-NEXT: v_mul_hi_u32 v15, v4, v13 ; GISEL-NEXT: s_subb_u32 s6, 0, 0 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v18, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 v7, v19, v13 -; GISEL-NEXT: v_mul_lo_u32 v13, v19, v16 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v14, v4 -; GISEL-NEXT: v_mul_hi_u32 v14, v18, v16 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v7, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v7 -; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc -; GISEL-NEXT: v_xor_b32_e32 v17, v0, v7 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v19, v16 -; GISEL-NEXT: v_xor_b32_e32 v20, v1, v7 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v9 +; GISEL-NEXT: v_mul_lo_u32 v9, v17, v13 +; GISEL-NEXT: v_mul_hi_u32 v13, v17, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v19 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v9, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v9 +; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc +; GISEL-NEXT: v_xor_b32_e32 v18, v0, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v14 +; GISEL-NEXT: v_xor_b32_e32 v19, v1, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v16, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v18, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v20, v0 -; GISEL-NEXT: v_mul_lo_u32 v14, v17, v1 -; GISEL-NEXT: v_mul_hi_u32 v15, v17, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v20, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v13, v19, v0 +; GISEL-NEXT: v_mul_lo_u32 v14, v18, v1 +; GISEL-NEXT: v_mul_hi_u32 v15, v18, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v19, v0 ; GISEL-NEXT: v_mov_b32_e32 v4, 0x12d8fb ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v15, v20, v1 +; GISEL-NEXT: v_mul_lo_u32 v15, v19, v1 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_mul_hi_u32 v14, v17, v1 +; GISEL-NEXT: v_mul_hi_u32 v14, v18, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_add_i32_e32 v18, vcc, v0, v13 -; GISEL-NEXT: v_mul_hi_u32 v15, v20, v1 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v18, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v13 +; GISEL-NEXT: v_mul_hi_u32 v1, v19, v1 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v0, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v1, v0 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v17, 0 ; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v15, v[1:2] -; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], 0, v18, v[13:14] -; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v17, v0 -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v20, v15 -; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v20, v15, vcc +; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], 0, v17, v[13:14] +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v18, v0 +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v15 +; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v19, v15, vcc ; GISEL-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc ; GISEL-NEXT: v_sub_i32_e32 v18, vcc, v16, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v0, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v16, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v17 -; GISEL-NEXT: v_cndmask_b32_e32 v20, -1, v0, vcc -; GISEL-NEXT: v_mov_b32_e32 v0, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 ; GISEL-NEXT: v_cndmask_b32_e64 v15, -1, v1, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v9, v[0:1] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, v[13:14] -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v18, v4 -; GISEL-NEXT: v_mul_lo_u32 v13, v8, v0 -; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v19, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v20, -1, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[5:6] +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[0:1] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v18, v4 +; GISEL-NEXT: v_mul_lo_u32 v5, v7, v13 +; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v19, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GISEL-NEXT: v_cndmask_b32_e32 v14, v18, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v5, v19, v5, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v13 +; GISEL-NEXT: v_cndmask_b32_e32 v14, v18, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v18, v19, v1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v5, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GISEL-NEXT: v_mul_hi_u32 v1, v7, v13 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; GISEL-NEXT: v_mul_hi_u32 v10, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v9, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_mul_hi_u32 v10, v8, v0 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v8, v1 -; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v9, v0, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v7, v0 +; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v8, v1, vcc ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 -; GISEL-NEXT: v_cndmask_b32_e32 v13, v17, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v11, v[1:2] -; GISEL-NEXT: v_cndmask_b32_e32 v12, v16, v14, vcc -; GISEL-NEXT: v_xor_b32_e32 v1, v12, v7 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, v[8:9] -; GISEL-NEXT: v_ashrrev_i32_e32 v12, 31, v3 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v12 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc -; GISEL-NEXT: v_xor_b32_e32 v9, v2, v12 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v16, v14, vcc +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v11, v[1:2] +; GISEL-NEXT: v_xor_b32_e32 v1, v5, v9 +; GISEL-NEXT: v_ashrrev_i32_e32 v13, 31, v3 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, v[7:8] +; GISEL-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v13 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v13, vcc +; GISEL-NEXT: v_xor_b32_e32 v14, v2, v13 ; GISEL-NEXT: v_mul_lo_u32 v2, v11, v0 ; GISEL-NEXT: v_mul_lo_u32 v6, v10, v5 -; GISEL-NEXT: v_xor_b32_e32 v14, v3, v12 +; GISEL-NEXT: v_xor_b32_e32 v15, v3, v13 ; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 @@ -1876,37 +1862,36 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 ; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0 -; GISEL-NEXT: v_mul_lo_u32 v5, v9, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v9, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 -; GISEL-NEXT: v_xor_b32_e32 v8, v13, v7 +; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0 +; GISEL-NEXT: v_mul_lo_u32 v5, v14, v2 +; GISEL-NEXT: v_mul_hi_u32 v6, v14, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 +; GISEL-NEXT: v_xor_b32_e32 v7, v12, v9 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v6, v14, v2 +; GISEL-NEXT: v_mul_lo_u32 v6, v15, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GISEL-NEXT: v_mul_hi_u32 v5, v9, v2 +; GISEL-NEXT: v_mul_hi_u32 v5, v14, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v6, v14, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v10, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v6, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v11, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v7 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v7, vcc +; GISEL-NEXT: v_mul_hi_u32 v5, v15, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v10, 0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v0, v[3:4] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v9 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v7, v9, vcc ; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], 0, v10, v[5:6] -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v9, v2 -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v14, v7 -; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v14, v7, vcc +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v14, v2 +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v15, v7 +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v15, v7, vcc ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v2, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc @@ -1926,10 +1911,10 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v12 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v12 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v13 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v13 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v13 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v13, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_srem_v2i64_oddk_denom: @@ -1942,176 +1927,173 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; CGP-NEXT: v_trunc_f32_e32 v8, v5 -; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8 +; CGP-NEXT: v_trunc_f32_e32 v5, v5 +; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 +; CGP-NEXT: v_cvt_u32_f32_e32 v8, v5 ; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 -; CGP-NEXT: v_mov_b32_e32 v9, v5 -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v8, v[9:10] -; CGP-NEXT: v_mul_hi_u32 v12, v8, v4 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v7, v[10:11] -; CGP-NEXT: v_mul_lo_u32 v10, v8, v4 -; CGP-NEXT: v_mul_hi_u32 v11, v7, v4 -; CGP-NEXT: v_mul_lo_u32 v4, v7, v13 -; CGP-NEXT: v_mul_lo_u32 v9, v8, v13 -; CGP-NEXT: v_mul_hi_u32 v14, v7, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v8, v13 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[5:6] +; CGP-NEXT: v_mul_hi_u32 v11, v8, v4 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], -1, v7, v[9:10] +; CGP-NEXT: v_mul_lo_u32 v9, v8, v4 +; CGP-NEXT: v_mul_hi_u32 v10, v7, v4 +; CGP-NEXT: v_mul_lo_u32 v4, v7, v12 +; CGP-NEXT: v_mul_lo_u32 v13, v8, v12 +; CGP-NEXT: v_mul_hi_u32 v14, v7, v12 +; CGP-NEXT: v_mul_hi_u32 v12, v8, v12 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v11 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v14 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v14, v9 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 -; CGP-NEXT: v_add_i32_e32 v18, vcc, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v18, 0 -; CGP-NEXT: v_addc_u32_e32 v19, vcc, v8, v9, vcc -; CGP-NEXT: v_mov_b32_e32 v4, v14 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v19, v[4:5] -; CGP-NEXT: v_mul_lo_u32 v4, v19, v13 -; CGP-NEXT: v_mad_u64_u32 v[16:17], s[4:5], -1, v18, v[14:15] -; CGP-NEXT: v_mul_lo_u32 v9, v18, v16 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v18, v13 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; CGP-NEXT: v_addc_u32_e32 v16, vcc, v8, v12, vcc +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v6, v4, 0 +; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v16, v[13:14] +; CGP-NEXT: v_mul_lo_u32 v17, v16, v12 +; CGP-NEXT: v_mul_hi_u32 v18, v4, v12 +; CGP-NEXT: v_mul_hi_u32 v19, v16, v12 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], -1, v4, v[14:15] +; CGP-NEXT: v_mul_lo_u32 v13, v4, v12 +; CGP-NEXT: v_mul_hi_u32 v15, v4, v12 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v17, v13 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_hi_u32 v9, v19, v13 -; CGP-NEXT: v_mul_lo_u32 v13, v19, v16 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4 -; CGP-NEXT: v_mul_hi_u32 v14, v18, v16 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v18 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v9, v14 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v9 -; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc -; CGP-NEXT: v_xor_b32_e32 v15, v0, v9 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v19, v16 -; CGP-NEXT: v_xor_b32_e32 v17, v1, v9 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; CGP-NEXT: v_mul_lo_u32 v14, v16, v12 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v19 +; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v15 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v15, vcc, v17, v15 +; CGP-NEXT: v_mul_hi_u32 v17, v16, v12 +; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v12 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v12, vcc +; CGP-NEXT: v_xor_b32_e32 v18, v0, v12 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v13 +; CGP-NEXT: v_xor_b32_e32 v19, v1, v12 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v18, v0 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v17, v0 -; CGP-NEXT: v_mul_lo_u32 v14, v15, v1 -; CGP-NEXT: v_mul_hi_u32 v16, v15, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v17, v0 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v15, v1 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v17, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v16, v1, vcc +; CGP-NEXT: v_mul_lo_u32 v13, v19, v0 +; CGP-NEXT: v_mul_lo_u32 v14, v18, v1 +; CGP-NEXT: v_mul_hi_u32 v15, v18, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v19, v0 ; CGP-NEXT: v_mov_b32_e32 v4, 0x12d8fb ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v16, v17, v1 +; CGP-NEXT: v_mul_lo_u32 v15, v19, v1 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_mul_hi_u32 v14, v15, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v14, v18, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v0 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v0, v13 -; CGP-NEXT: v_mul_hi_u32 v18, v17, v1 +; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v15, v19, v1 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v0 ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v13, 0 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v16 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v18, v13 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v16, v[1:2] -; CGP-NEXT: v_sub_i32_e32 v15, vcc, v15, v0 -; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v17, v13 -; CGP-NEXT: v_subb_u32_e64 v16, s[4:5], v17, v13, vcc +; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v14 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v15, v[1:2] +; CGP-NEXT: v_sub_i32_e32 v15, vcc, v18, v0 +; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13 +; CGP-NEXT: v_subb_u32_e64 v16, s[4:5], v19, v13, vcc ; CGP-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc ; CGP-NEXT: v_sub_i32_e32 v18, vcc, v15, v4 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v4 ; CGP-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v0, vcc +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v4 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4 -; CGP-NEXT: v_cndmask_b32_e64 v17, -1, v1, s[4:5] -; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; CGP-NEXT: v_mov_b32_e32 v0, v5 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v8, v[0:1] +; CGP-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 -; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v1, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[13:14] -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v18, v4 -; CGP-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v19, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CGP-NEXT: v_mul_lo_u32 v5, v7, v0 -; CGP-NEXT: v_cndmask_b32_e32 v14, v18, v1, vcc -; CGP-NEXT: v_cndmask_b32_e32 v13, v19, v13, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v5 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11 -; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v10, v8, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1 -; CGP-NEXT: v_mul_hi_u32 v5, v7, v0 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v7, v1 -; CGP-NEXT: v_addc_u32_e32 v11, vcc, v8, v0, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 +; CGP-NEXT: v_cndmask_b32_e64 v17, -1, v1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e32 v20, -1, v0, vcc +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[5:6] +; CGP-NEXT: v_sub_i32_e32 v5, vcc, v18, v4 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v7, v[0:1] +; CGP-NEXT: v_subbrev_u32_e32 v21, vcc, 0, v19, vcc +; CGP-NEXT: v_mul_lo_u32 v1, v7, v13 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; CGP-NEXT: v_cndmask_b32_e32 v0, v18, v5, vcc +; CGP-NEXT: v_cndmask_b32_e32 v5, v19, v21, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 -; CGP-NEXT: v_cndmask_b32_e32 v5, v15, v14, vcc -; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v11, v[1:2] -; CGP-NEXT: v_cndmask_b32_e32 v12, v16, v13, vcc -; CGP-NEXT: v_xor_b32_e32 v13, v5, v9 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v10, v[7:8] -; CGP-NEXT: v_xor_b32_e32 v1, v12, v9 -; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc -; CGP-NEXT: v_xor_b32_e32 v7, v2, v12 -; CGP-NEXT: v_mul_lo_u32 v2, v11, v0 -; CGP-NEXT: v_mul_lo_u32 v6, v10, v5 -; CGP-NEXT: v_xor_b32_e32 v8, v3, v12 -; CGP-NEXT: v_mul_hi_u32 v3, v10, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v11, v0 +; CGP-NEXT: v_cndmask_b32_e32 v14, v15, v0, vcc +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v9, v1 +; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v0, v10 +; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; CGP-NEXT: v_mul_lo_u32 v9, v8, v13 +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v1, v0 +; CGP-NEXT: v_mul_hi_u32 v1, v7, v13 +; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v11 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v9, v1 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v10, v9 +; CGP-NEXT: v_mul_hi_u32 v10, v8, v13 +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v1, v0 +; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v9, v1 +; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v10, v1 +; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v7, v0 +; CGP-NEXT: v_addc_u32_e64 v10, s[4:5], v8, v1, s[4:5] +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, 0 +; CGP-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc +; CGP-NEXT: v_ashrrev_i32_e32 v13, 31, v3 +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v10, v[1:2] +; CGP-NEXT: v_xor_b32_e32 v1, v5, v12 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v13 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v9, v[7:8] +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v13, vcc +; CGP-NEXT: v_xor_b32_e32 v7, v2, v13 +; CGP-NEXT: v_mul_lo_u32 v2, v10, v0 +; CGP-NEXT: v_mul_lo_u32 v6, v9, v5 +; CGP-NEXT: v_xor_b32_e32 v8, v3, v13 +; CGP-NEXT: v_mul_hi_u32 v3, v9, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v10, v0 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v3, v11, v5 +; CGP-NEXT: v_mul_lo_u32 v3, v10, v5 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; CGP-NEXT: v_mul_hi_u32 v6, v10, v5 +; CGP-NEXT: v_mul_hi_u32 v6, v9, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CGP-NEXT: v_mul_hi_u32 v5, v11, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v10, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v0 -; CGP-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v0 +; CGP-NEXT: v_addc_u32_e32 v2, vcc, v10, v2, vcc ; CGP-NEXT: v_mul_lo_u32 v5, v8, v3 ; CGP-NEXT: v_mul_lo_u32 v6, v7, v2 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v13, v9 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc +; CGP-NEXT: v_xor_b32_e32 v11, v14, v12 ; CGP-NEXT: v_mul_hi_u32 v9, v7, v3 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v12 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v12, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 @@ -2126,11 +2108,11 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v3, v5 -; CGP-NEXT: v_mul_hi_u32 v10, v8, v2 +; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v9, v8, v2 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v3 ; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v5, 0 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v9 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v5 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v6 ; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v9, v[3:4] ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v7, v2 ; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v8, v5, vcc @@ -2154,10 +2136,10 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v12 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v12 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v2, v13 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v13 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v13 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v13, vcc ; CGP-NEXT: s_setpc_b64 s[30:31] %result = srem <2 x i64> %num, ret <2 x i64> %result @@ -2193,102 +2175,100 @@ define i64 @v_srem_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: v_xor_b32_e32 v1, v2, v1 ; CHECK-NEXT: v_cvt_f32_u32_e32 v2, v0 ; CHECK-NEXT: v_cvt_f32_u32_e32 v5, v1 -; CHECK-NEXT: v_sub_i32_e32 v11, vcc, 0, v0 -; CHECK-NEXT: v_subb_u32_e32 v12, vcc, 0, v1, vcc +; CHECK-NEXT: v_sub_i32_e32 v12, vcc, 0, v0 +; CHECK-NEXT: v_subb_u32_e32 v13, vcc, 0, v1, vcc ; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v5 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CHECK-NEXT: v_mul_f32_e32 v5, 0x2f800000, v2 -; CHECK-NEXT: v_trunc_f32_e32 v7, v5 -; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v7 -; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v13, v7 -; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v10, 0 -; CHECK-NEXT: v_mov_b32_e32 v2, v6 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[2:3] -; CHECK-NEXT: v_mul_lo_u32 v2, v13, v5 -; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[6:7] -; CHECK-NEXT: v_mul_hi_u32 v6, v10, v5 -; CHECK-NEXT: v_mul_hi_u32 v5, v13, v5 -; CHECK-NEXT: v_mul_lo_u32 v7, v10, v8 -; CHECK-NEXT: v_mul_lo_u32 v9, v13, v8 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; CHECK-NEXT: v_mul_hi_u32 v6, v10, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v9, v5 +; CHECK-NEXT: v_trunc_f32_e32 v5, v5 +; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v5 +; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v5 +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v2, 0 +; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v11, v[6:7] +; CHECK-NEXT: v_mul_lo_u32 v6, v11, v5 +; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v2, v[7:8] +; CHECK-NEXT: v_mul_hi_u32 v7, v2, v5 +; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5 +; CHECK-NEXT: v_mul_lo_u32 v8, v2, v9 +; CHECK-NEXT: v_mul_lo_u32 v10, v11, v9 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CHECK-NEXT: v_mul_hi_u32 v7, v2, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CHECK-NEXT: v_mul_hi_u32 v8, v11, v9 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CHECK-NEXT: v_mul_hi_u32 v7, v13, v8 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v2 -; CHECK-NEXT: v_addc_u32_e32 v13, vcc, v13, v5, vcc -; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v10, 0 -; CHECK-NEXT: v_mov_b32_e32 v2, v6 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[2:3] -; CHECK-NEXT: v_ashrrev_i32_e32 v11, 31, v4 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v11 -; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[6:7] -; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v4, v11, vcc -; CHECK-NEXT: v_xor_b32_e32 v9, v2, v11 -; CHECK-NEXT: v_mul_lo_u32 v2, v13, v5 -; CHECK-NEXT: v_mul_lo_u32 v4, v10, v8 -; CHECK-NEXT: v_xor_b32_e32 v12, v3, v11 -; CHECK-NEXT: v_mul_hi_u32 v3, v10, v5 -; CHECK-NEXT: v_mul_hi_u32 v5, v13, v5 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v3, v13, v8 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; CHECK-NEXT: v_mul_hi_u32 v4, v10, v8 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; CHECK-NEXT: v_addc_u32_e32 v11, vcc, v11, v6, vcc +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v2, 0 +; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v11, v[6:7] +; CHECK-NEXT: v_ashrrev_i32_e32 v12, 31, v4 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v12 +; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v2, v[7:8] +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v12, vcc +; CHECK-NEXT: v_xor_b32_e32 v8, v3, v12 +; CHECK-NEXT: v_mul_lo_u32 v3, v11, v5 +; CHECK-NEXT: v_mul_lo_u32 v6, v2, v9 +; CHECK-NEXT: v_xor_b32_e32 v10, v4, v12 +; CHECK-NEXT: v_mul_hi_u32 v4, v2, v5 +; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CHECK-NEXT: v_mul_hi_u32 v5, v13, v8 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v4, v11, v9 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; CHECK-NEXT: v_mul_hi_u32 v6, v2, v9 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; CHECK-NEXT: v_mul_hi_u32 v6, v11, v9 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2 -; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v13, v3, vcc -; CHECK-NEXT: v_mul_lo_u32 v4, v12, v2 -; CHECK-NEXT: v_mul_lo_u32 v5, v9, v3 -; CHECK-NEXT: v_mul_hi_u32 v6, v9, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v12, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v11, v4, vcc +; CHECK-NEXT: v_mul_lo_u32 v4, v10, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, v8, v3 +; CHECK-NEXT: v_mul_hi_u32 v6, v8, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v6, v12, v3 +; CHECK-NEXT: v_mul_lo_u32 v6, v10, v3 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CHECK-NEXT: v_mul_hi_u32 v5, v9, v3 +; CHECK-NEXT: v_mul_hi_u32 v5, v8, v3 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v2, v4 -; CHECK-NEXT: v_mul_hi_u32 v6, v12, v3 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v8, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v4 +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v2, v4 +; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v2 +; CHECK-NEXT: v_mul_hi_u32 v5, v10, v3 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v9, 0 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v5, v4 ; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v6, v[3:4] -; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v9, v2 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v8, v[4:5] -; CHECK-NEXT: v_subb_u32_e64 v3, s[4:5], v12, v6, vcc -; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v12, v6 +; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v8, v2 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v9, v[4:5] +; CHECK-NEXT: v_subb_u32_e64 v3, s[4:5], v10, v6, vcc +; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v10, v6 ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v0 @@ -2313,10 +2293,10 @@ define i64 @v_srem_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v11 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v11 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v11 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v11, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v12 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v12 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v12 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v12, vcc ; CHECK-NEXT: ; implicit-def: $vgpr5_vgpr6 ; CHECK-NEXT: ; implicit-def: $vgpr3 ; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] @@ -2351,224 +2331,220 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-LABEL: v_srem_v2i64_pow2_shl_denom: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v8, 0x1000 -; GISEL-NEXT: v_mov_b32_e32 v9, 0 -; GISEL-NEXT: v_lshl_b64 v[4:5], v[8:9], v4 +; GISEL-NEXT: v_mov_b32_e32 v10, 0x1000 +; GISEL-NEXT: v_mov_b32_e32 v11, 0 +; GISEL-NEXT: v_lshl_b64 v[4:5], v[10:11], v4 ; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v5, v7, vcc +; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v5, v7, vcc ; GISEL-NEXT: v_xor_b32_e32 v5, v4, v7 -; GISEL-NEXT: v_xor_b32_e32 v7, v10, v7 +; GISEL-NEXT: v_xor_b32_e32 v7, v8, v7 ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v5 -; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v7 -; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v5 -; GISEL-NEXT: v_subb_u32_e32 v17, vcc, 0, v7, vcc -; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v10 +; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v7 +; GISEL-NEXT: v_sub_i32_e32 v17, vcc, 0, v5 +; GISEL-NEXT: v_subb_u32_e32 v18, vcc, 0, v7, vcc +; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v8 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; GISEL-NEXT: v_mul_f32_e32 v10, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v12, v10 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v12 -; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v12 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v15, 0 -; GISEL-NEXT: v_mov_b32_e32 v4, v11 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v18, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v4, v18, v10 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v15, v[11:12] -; GISEL-NEXT: v_mul_hi_u32 v12, v15, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v18, v10 -; GISEL-NEXT: v_mul_lo_u32 v11, v15, v13 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, v18, v13 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v4 -; GISEL-NEXT: v_mul_hi_u32 v11, v15, v13 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; GISEL-NEXT: v_mul_f32_e32 v8, 0x2f800000, v4 +; GISEL-NEXT: v_trunc_f32_e32 v8, v8 +; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8 +; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v8 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v17, v4, 0 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v16, v[9:10] +; GISEL-NEXT: v_mul_lo_u32 v9, v16, v8 +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v4, v[12:13] +; GISEL-NEXT: v_mul_lo_u32 v12, v4, v14 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v4, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v8, v16, v8 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v16, v14 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; GISEL-NEXT: v_mul_hi_u32 v13, v4, v14 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v12, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_mul_hi_u32 v12, v18, v13 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v10, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v4 -; GISEL-NEXT: v_addc_u32_e32 v18, vcc, v18, v10, vcc -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v15, 0 -; GISEL-NEXT: v_mov_b32_e32 v4, v11 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v18, v[4:5] +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; GISEL-NEXT: v_mul_hi_u32 v13, v16, v14 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; GISEL-NEXT: v_add_i32_e32 v19, vcc, v4, v8 +; GISEL-NEXT: v_addc_u32_e32 v16, vcc, v16, v9, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v17, v19, 0 ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v15, v[11:12] +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v16, v[9:10] ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; GISEL-NEXT: v_xor_b32_e32 v12, v0, v4 -; GISEL-NEXT: v_mul_lo_u32 v0, v18, v10 -; GISEL-NEXT: v_mul_lo_u32 v11, v15, v13 -; GISEL-NEXT: v_xor_b32_e32 v16, v1, v4 -; GISEL-NEXT: v_mul_hi_u32 v1, v15, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v18, v10 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v19, v[12:13] +; GISEL-NEXT: v_xor_b32_e32 v15, v0, v4 +; GISEL-NEXT: v_mul_lo_u32 v0, v16, v8 +; GISEL-NEXT: v_mul_lo_u32 v9, v19, v14 +; GISEL-NEXT: v_xor_b32_e32 v17, v1, v4 +; GISEL-NEXT: v_mul_hi_u32 v1, v19, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v16, v8 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v18, v13 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 -; GISEL-NEXT: v_mul_hi_u32 v11, v15, v13 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v18, v13 +; GISEL-NEXT: v_mul_lo_u32 v1, v16, v14 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 +; GISEL-NEXT: v_mul_hi_u32 v9, v19, v14 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v16, v14 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v15, v0 -; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v18, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v16, v10 -; GISEL-NEXT: v_mul_lo_u32 v14, v12, v11 -; GISEL-NEXT: v_lshl_b64 v[0:1], v[8:9], v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v12, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v16, v10 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v8, v16, v11 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 -; GISEL-NEXT: v_mul_hi_u32 v9, v12, v11 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v19, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v16, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v8, v17, v0 +; GISEL-NEXT: v_mul_lo_u32 v9, v15, v1 +; GISEL-NEXT: v_mul_hi_u32 v12, v15, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v17, v0 +; GISEL-NEXT: v_lshl_b64 v[10:11], v[10:11], v6 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v8, v6 -; GISEL-NEXT: v_mul_hi_u32 v8, v16, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v8, v6 -; GISEL-NEXT: v_ashrrev_i32_e32 v8, 31, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v8, vcc -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v13, 0 -; GISEL-NEXT: v_xor_b32_e32 v6, v0, v8 -; GISEL-NEXT: v_xor_b32_e32 v8, v1, v8 -; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v15, v8 -; GISEL-NEXT: v_mov_b32_e32 v0, v10 -; GISEL-NEXT: v_sub_i32_e32 v17, vcc, 0, v6 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v5, v14, v[0:1] -; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v15 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v14, v1 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v13, v[10:11] -; GISEL-NEXT: v_subb_u32_e32 v18, vcc, 0, v8, vcc -; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v14 -; GISEL-NEXT: v_mul_f32_e32 v10, 0x2f800000, v1 -; GISEL-NEXT: v_trunc_f32_e32 v13, v10 -; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v13 -; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v1 -; GISEL-NEXT: v_cvt_u32_f32_e32 v19, v13 -; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v12, v9 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v17, v15, 0 -; GISEL-NEXT: v_subb_u32_e64 v20, s[4:5], v16, v0, vcc -; GISEL-NEXT: v_mov_b32_e32 v1, v11 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v19, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v1, v19, v10 -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v16, v0 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v18, v15, v[11:12] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v20, v7 -; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v0, v7, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v15, v13 -; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v15, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v9, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v20, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v11, v1, v11, s[6:7] -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v9, v5 -; GISEL-NEXT: v_subbrev_u32_e64 v14, s[6:7], 0, v0, vcc -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v1, v5 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v14, v7 -; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v0, v7, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v14, v7 -; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v1, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v16, v16, v21, s[6:7] -; GISEL-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v1, v5, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v7, v14, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_hi_u32 v1, v19, v10 -; GISEL-NEXT: v_mul_lo_u32 v10, v19, v13 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v17, v1 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_mul_hi_u32 v9, v15, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 -; GISEL-NEXT: v_mul_hi_u32 v12, v15, v13 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v19, v13 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v0 -; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v19, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v13, 0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v0, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v0 +; GISEL-NEXT: v_mul_hi_u32 v9, v17, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v14, 0 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v9, v8 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v5, v12, v[1:2] +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v7, v14, v[8:9] +; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v15, v0 +; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], v17, v12, vcc +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v17, v12 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v5 +; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v0, v7, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v14, v7 +; GISEL-NEXT: v_sub_i32_e32 v15, vcc, v13, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v12, v1, v6, s[4:5] +; GISEL-NEXT: v_subbrev_u32_e64 v16, s[4:5], 0, v0, vcc +; GISEL-NEXT: v_ashrrev_i32_e32 v1, 31, v11 +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v10, v1 +; GISEL-NEXT: v_addc_u32_e64 v8, s[4:5], v11, v1, s[4:5] +; GISEL-NEXT: v_xor_b32_e32 v6, v6, v1 +; GISEL-NEXT: v_xor_b32_e32 v8, v8, v1 +; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v8 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v16, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v5 +; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v9 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v16, v7 +; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v0, v7, vcc +; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1 +; GISEL-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GISEL-NEXT: v_trunc_f32_e32 v1, v1 +; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v0 +; GISEL-NEXT: v_sub_i32_e32 v19, vcc, 0, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v11, v10, v11, s[4:5] +; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v17, 0 +; GISEL-NEXT: v_subb_u32_e32 v20, vcc, 0, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v19, v18, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v21, v18, v0 +; GISEL-NEXT: v_mul_hi_u32 v22, v17, v0 +; GISEL-NEXT: v_mul_hi_u32 v23, v18, v0 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v15, v5 +; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v7, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v17, v14, v[1:2] -; GISEL-NEXT: v_xor_b32_e32 v1, v5, v4 -; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v3 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v18, v13, v[9:10] -; GISEL-NEXT: v_cndmask_b32_e32 v7, v20, v7, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v12, v2, v5 -; GISEL-NEXT: v_mul_lo_u32 v2, v14, v0 -; GISEL-NEXT: v_mul_lo_u32 v9, v13, v11 -; GISEL-NEXT: v_xor_b32_e32 v15, v3, v5 -; GISEL-NEXT: v_mul_hi_u32 v3, v13, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 +; GISEL-NEXT: v_cndmask_b32_e32 v7, v15, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v17, v[9:10] +; GISEL-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc +; GISEL-NEXT: v_mul_lo_u32 v1, v17, v0 +; GISEL-NEXT: v_mul_lo_u32 v10, v18, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v21, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v22 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 +; GISEL-NEXT: v_mul_hi_u32 v9, v17, v0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v23 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v1 +; GISEL-NEXT: v_addc_u32_e32 v16, vcc, v18, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v15, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GISEL-NEXT: v_cndmask_b32_e32 v7, v13, v7, vcc +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v19, v16, v[1:2] +; GISEL-NEXT: v_xor_b32_e32 v1, v7, v4 +; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v3 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v20, v15, v[9:10] +; GISEL-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v7 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v7, vcc +; GISEL-NEXT: v_xor_b32_e32 v12, v2, v7 +; GISEL-NEXT: v_mul_lo_u32 v2, v16, v0 +; GISEL-NEXT: v_mul_lo_u32 v9, v15, v11 +; GISEL-NEXT: v_xor_b32_e32 v13, v3, v7 +; GISEL-NEXT: v_mul_hi_u32 v3, v15, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v14, v11 +; GISEL-NEXT: v_mul_lo_u32 v3, v16, v11 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2 -; GISEL-NEXT: v_mul_hi_u32 v9, v13, v11 +; GISEL-NEXT: v_mul_hi_u32 v9, v15, v11 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v14, v11 +; GISEL-NEXT: v_mul_hi_u32 v9, v16, v11 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v14, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v16, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0 ; GISEL-NEXT: v_mul_lo_u32 v9, v12, v2 ; GISEL-NEXT: v_mul_hi_u32 v10, v12, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 -; GISEL-NEXT: v_xor_b32_e32 v7, v7, v4 +; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 +; GISEL-NEXT: v_xor_b32_e32 v5, v5, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, v15, v2 +; GISEL-NEXT: v_mul_lo_u32 v10, v13, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v9, v3 ; GISEL-NEXT: v_mul_hi_u32 v9, v12, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 @@ -2577,26 +2553,25 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v10, v15, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v11, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v10, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v13, v[0:1] +; GISEL-NEXT: v_mul_hi_u32 v9, v13, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v11, 0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v0, v[3:4] ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v7, v4, vcc +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v5, v4, vcc ; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v11, v[9:10] ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v15, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v15, v3 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v13, v3, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v13, v3 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v8 ; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v8, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[4:5] ; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v2, v6 ; GISEL-NEXT: v_subbrev_u32_e64 v10, s[4:5], 0, v3, vcc ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v8 @@ -2611,13 +2586,13 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; GISEL-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v5 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v5 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v5, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v7 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v7 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v7 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_srem_v2i64_pow2_shl_denom: @@ -2645,103 +2620,100 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_xor_b32_e32 v1, v4, v1 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v0 ; CGP-NEXT: v_cvt_f32_u32_e32 v10, v1 -; CGP-NEXT: v_sub_i32_e32 v16, vcc, 0, v0 -; CGP-NEXT: v_subb_u32_e32 v17, vcc, 0, v1, vcc +; CGP-NEXT: v_sub_i32_e32 v17, vcc, 0, v0 +; CGP-NEXT: v_subb_u32_e32 v18, vcc, 0, v1, vcc ; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v10 ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_mul_f32_e32 v10, 0x2f800000, v4 -; CGP-NEXT: v_trunc_f32_e32 v12, v10 -; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v12 -; CGP-NEXT: v_cvt_u32_f32_e32 v15, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v18, v12 -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v15, 0 -; CGP-NEXT: v_mov_b32_e32 v4, v11 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v18, v[4:5] -; CGP-NEXT: v_mul_lo_u32 v4, v18, v10 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v15, v[11:12] -; CGP-NEXT: v_mul_hi_u32 v11, v15, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v18, v10 -; CGP-NEXT: v_mul_lo_u32 v12, v15, v13 -; CGP-NEXT: v_mul_lo_u32 v14, v18, v13 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 -; CGP-NEXT: v_mul_hi_u32 v11, v15, v13 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v12, v4 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v14, v10 +; CGP-NEXT: v_trunc_f32_e32 v10, v10 +; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v10 +; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 +; CGP-NEXT: v_cvt_u32_f32_e32 v16, v10 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v17, v4, 0 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v16, v[11:12] +; CGP-NEXT: v_mul_lo_u32 v11, v16, v10 +; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v4, v[12:13] +; CGP-NEXT: v_mul_hi_u32 v12, v4, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v16, v10 +; CGP-NEXT: v_mul_lo_u32 v13, v4, v14 +; CGP-NEXT: v_mul_lo_u32 v15, v16, v14 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; CGP-NEXT: v_mul_hi_u32 v12, v4, v14 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v15, v10 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CGP-NEXT: v_mul_hi_u32 v13, v16, v14 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; CGP-NEXT: v_mul_hi_u32 v12, v18, v13 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v4 -; CGP-NEXT: v_addc_u32_e32 v18, vcc, v18, v10, vcc -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v15, 0 -; CGP-NEXT: v_mov_b32_e32 v4, v11 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v18, v[4:5] -; CGP-NEXT: v_ashrrev_i32_e32 v16, 31, v9 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v16 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v15, v[11:12] -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v16, vcc -; CGP-NEXT: v_xor_b32_e32 v14, v4, v16 -; CGP-NEXT: v_mul_lo_u32 v4, v18, v10 -; CGP-NEXT: v_mul_lo_u32 v9, v15, v13 -; CGP-NEXT: v_xor_b32_e32 v17, v8, v16 -; CGP-NEXT: v_mul_hi_u32 v8, v15, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v18, v10 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v8, v18, v13 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; CGP-NEXT: v_mul_hi_u32 v9, v15, v13 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 +; CGP-NEXT: v_addc_u32_e32 v16, vcc, v16, v11, vcc +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v17, v4, 0 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v16, v[11:12] +; CGP-NEXT: v_ashrrev_i32_e32 v17, 31, v9 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v17 +; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v4, v[12:13] +; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v17, vcc +; CGP-NEXT: v_xor_b32_e32 v15, v8, v17 +; CGP-NEXT: v_mul_lo_u32 v8, v16, v10 +; CGP-NEXT: v_mul_lo_u32 v11, v4, v14 +; CGP-NEXT: v_xor_b32_e32 v18, v9, v17 +; CGP-NEXT: v_mul_hi_u32 v9, v4, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v16, v10 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; CGP-NEXT: v_mul_hi_u32 v10, v18, v13 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v9, v16, v14 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v8 +; CGP-NEXT: v_mul_hi_u32 v11, v4, v14 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; CGP-NEXT: v_mul_hi_u32 v11, v16, v14 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v18, v8, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v17, v4 -; CGP-NEXT: v_mul_lo_u32 v10, v14, v8 -; CGP-NEXT: v_mul_hi_u32 v11, v14, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v17, v4 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v16, v9, vcc +; CGP-NEXT: v_mul_lo_u32 v9, v18, v4 +; CGP-NEXT: v_mul_lo_u32 v10, v15, v8 +; CGP-NEXT: v_mul_hi_u32 v11, v15, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v18, v4 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v11, v17, v8 +; CGP-NEXT: v_mul_lo_u32 v11, v18, v8 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; CGP-NEXT: v_mul_hi_u32 v10, v14, v8 +; CGP-NEXT: v_mul_hi_u32 v10, v15, v8 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v4, v9 -; CGP-NEXT: v_mul_hi_u32 v11, v17, v8 -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v13, 0 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v4 -; CGP-NEXT: v_mov_b32_e32 v4, v9 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v0, v11, v[4:5] -; CGP-NEXT: v_sub_i32_e32 v4, vcc, v14, v8 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v1, v13, v[9:10] -; CGP-NEXT: v_subb_u32_e64 v8, s[4:5], v17, v11, vcc -; CGP-NEXT: v_sub_i32_e64 v9, s[4:5], v17, v11 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v11, v18, v8 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v9 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v4, 0 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v11, v10 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v0, v12, v[9:10] +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v1, v4, v[10:11] +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v15, v8 +; CGP-NEXT: v_subb_u32_e64 v8, s[4:5], v18, v12, vcc +; CGP-NEXT: v_sub_i32_e64 v9, s[4:5], v18, v12 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v1 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v0 @@ -2766,10 +2738,10 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; CGP-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; CGP-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; CGP-NEXT: v_xor_b32_e32 v0, v0, v16 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v16 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v16 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v16, vcc +; CGP-NEXT: v_xor_b32_e32 v0, v0, v17 +; CGP-NEXT: v_xor_b32_e32 v1, v1, v17 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v17 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v17, vcc ; CGP-NEXT: ; implicit-def: $vgpr11_vgpr12 ; CGP-NEXT: ; implicit-def: $vgpr8 ; CGP-NEXT: .LBB8_2: ; %Flow1 @@ -2827,76 +2799,74 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_mul_f32_e32 v6, 0x2f800000, v4 ; CGP-NEXT: v_trunc_f32_e32 v6, v6 ; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6 -; CGP-NEXT: v_cvt_u32_f32_e32 v13, v4 +; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 ; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v13, 0 -; CGP-NEXT: v_mov_b32_e32 v4, v9 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v6, v[4:5] -; CGP-NEXT: v_mul_lo_u32 v4, v6, v8 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[9:10] -; CGP-NEXT: v_mul_hi_u32 v9, v13, v8 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v4, 0 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v6, v[9:10] +; CGP-NEXT: v_mul_lo_u32 v9, v6, v8 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v4, v[10:11] +; CGP-NEXT: v_mul_hi_u32 v10, v4, v8 ; CGP-NEXT: v_mul_hi_u32 v8, v6, v8 -; CGP-NEXT: v_mul_lo_u32 v10, v13, v11 -; CGP-NEXT: v_mul_lo_u32 v12, v6, v11 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v13, v11 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v12, v8 +; CGP-NEXT: v_mul_lo_u32 v11, v4, v12 +; CGP-NEXT: v_mul_lo_u32 v13, v6, v12 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v4, v12 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v13, v8 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_mul_hi_u32 v11, v6, v12 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; CGP-NEXT: v_mul_hi_u32 v10, v6, v11 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v4 -; CGP-NEXT: v_addc_u32_e32 v6, vcc, v6, v8, vcc -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v13, 0 -; CGP-NEXT: v_mov_b32_e32 v4, v9 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v6, v[4:5] +; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; CGP-NEXT: v_addc_u32_e32 v6, vcc, v6, v9, vcc +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v4, 0 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v6, v[9:10] ; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v7 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v14 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[9:10] -; CGP-NEXT: v_addc_u32_e32 v5, vcc, v7, v14, vcc -; CGP-NEXT: v_xor_b32_e32 v10, v4, v14 -; CGP-NEXT: v_mul_lo_u32 v4, v6, v8 -; CGP-NEXT: v_mul_lo_u32 v7, v13, v11 -; CGP-NEXT: v_xor_b32_e32 v12, v5, v14 -; CGP-NEXT: v_mul_hi_u32 v5, v13, v8 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v4, v[10:11] +; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v14, vcc +; CGP-NEXT: v_xor_b32_e32 v10, v5, v14 +; CGP-NEXT: v_mul_lo_u32 v5, v6, v8 +; CGP-NEXT: v_mul_lo_u32 v9, v4, v12 +; CGP-NEXT: v_xor_b32_e32 v11, v7, v14 +; CGP-NEXT: v_mul_hi_u32 v7, v4, v8 ; CGP-NEXT: v_mul_hi_u32 v8, v6, v8 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v5, v6, v11 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; CGP-NEXT: v_mul_hi_u32 v7, v13, v11 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CGP-NEXT: v_mul_hi_u32 v8, v6, v11 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v7, v6, v12 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 +; CGP-NEXT: v_mul_hi_u32 v9, v4, v12 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; CGP-NEXT: v_mul_hi_u32 v9, v6, v12 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; CGP-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc -; CGP-NEXT: v_mul_lo_u32 v6, v12, v4 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CGP-NEXT: v_addc_u32_e32 v5, vcc, v6, v7, vcc +; CGP-NEXT: v_mul_lo_u32 v6, v11, v4 ; CGP-NEXT: v_mul_lo_u32 v7, v10, v5 ; CGP-NEXT: v_mul_hi_u32 v8, v10, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v12, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v11, v4 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v8, v12, v5 +; CGP-NEXT: v_mul_lo_u32 v8, v11, v5 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; CGP-NEXT: v_mul_hi_u32 v7, v10, v5 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 @@ -2904,17 +2874,17 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v4, v6 -; CGP-NEXT: v_mul_hi_u32 v8, v12, v5 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v11, 0 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v6 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v4, v6 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v4 +; CGP-NEXT: v_mul_hi_u32 v7, v11, v5 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v12, 0 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v7, v6 ; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v2, v8, v[5:6] ; CGP-NEXT: v_sub_i32_e32 v4, vcc, v10, v4 -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v11, v[6:7] -; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v12, v8, vcc -; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v12, v8 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v12, v[6:7] +; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v11, v8, vcc +; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v11, v8 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v3 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v2 @@ -3036,29 +3006,30 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4 ; GISEL-NEXT: v_cvt_f32_u32_e32 v3, v1 ; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v9, 0 -; GISEL-NEXT: v_sub_i32_e32 v11, vcc, 0, v1 +; GISEL-NEXT: v_sub_i32_e32 v12, vcc, 0, v1 ; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v9 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GISEL-NEXT: v_subb_u32_e64 v13, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GISEL-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 -; GISEL-NEXT: v_trunc_f32_e32 v5, v4 -; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5 -; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v3 -; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v5 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v10, 0 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v13, v[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v10, v[7:8] -; GISEL-NEXT: v_mul_lo_u32 v5, v13, v3 -; GISEL-NEXT: v_mul_hi_u32 v7, v10, v3 -; GISEL-NEXT: v_mul_lo_u32 v8, v10, v4 -; GISEL-NEXT: v_mul_hi_u32 v3, v13, v3 -; GISEL-NEXT: v_mul_lo_u32 v14, v13, v4 +; GISEL-NEXT: v_trunc_f32_e32 v4, v4 +; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v3 +; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v4 +; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v11, 0 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v10, v[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v11, v[7:8] +; GISEL-NEXT: v_mul_lo_u32 v5, v10, v3 +; GISEL-NEXT: v_mul_hi_u32 v7, v11, v3 +; GISEL-NEXT: v_mul_lo_u32 v8, v11, v4 +; GISEL-NEXT: v_mul_hi_u32 v3, v10, v3 +; GISEL-NEXT: v_mul_lo_u32 v14, v10, v4 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v10, v4 +; GISEL-NEXT: v_mul_hi_u32 v7, v11, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v14, v3 @@ -3066,165 +3037,160 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GISEL-NEXT: v_mul_hi_u32 v4, v13, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v10, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v3 -; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v13, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v10, 0 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v13, v[4:5] -; GISEL-NEXT: v_and_b32_e32 v11, 0xffffff, v0 -; GISEL-NEXT: v_mul_lo_u32 v0, v13, v3 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v10, v[7:8] -; GISEL-NEXT: v_mul_hi_u32 v7, v10, v3 -; GISEL-NEXT: v_mul_hi_u32 v3, v13, v3 -; GISEL-NEXT: v_mul_lo_u32 v5, v10, v4 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v3 +; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v10, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v11, 0 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v10, v[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v11, v[7:8] +; GISEL-NEXT: v_mul_lo_u32 v5, v10, v3 +; GISEL-NEXT: v_mul_hi_u32 v8, v11, v3 +; GISEL-NEXT: v_mul_lo_u32 v7, v11, v4 +; GISEL-NEXT: v_mul_hi_u32 v3, v10, v3 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v7, v13, v4 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_mul_hi_u32 v5, v10, v4 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; GISEL-NEXT: v_mul_lo_u32 v8, v10, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; GISEL-NEXT: v_mul_hi_u32 v7, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v8, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; GISEL-NEXT: v_mul_hi_u32 v4, v10, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_mul_hi_u32 v4, v13, v4 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v13, v3, vcc -; GISEL-NEXT: v_mul_lo_u32 v4, 0, v0 -; GISEL-NEXT: v_mul_lo_u32 v5, v11, v7 -; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v11, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, 0, v0 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GISEL-NEXT: v_mul_lo_u32 v5, 0, v7 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v11, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v0, v4 -; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v12, 0 -; GISEL-NEXT: v_mul_hi_u32 v7, 0, v7 -; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v9 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v8, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v7, v[0:1] -; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v8 -; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v0 -; GISEL-NEXT: v_trunc_f32_e32 v9, v7 -; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v9 -; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v0 -; GISEL-NEXT: v_sub_i32_e32 v14, vcc, 0, v3 -; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v9 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v14, v13, 0 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], 0, v12, v[5:6] -; GISEL-NEXT: v_mov_b32_e32 v0, v8 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v16, v[0:1] -; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_sub_i32_e32 v12, vcc, v11, v4 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v13, v[5:6] -; GISEL-NEXT: v_mul_lo_u32 v4, v16, v7 -; GISEL-NEXT: v_mul_hi_u32 v8, v13, v7 -; GISEL-NEXT: v_mul_lo_u32 v5, v13, v10 -; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], 0, v9, vcc -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], 0, v9 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v8 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v3 +; GISEL-NEXT: v_addc_u32_e32 v4, vcc, v10, v4, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, 0, v5 +; GISEL-NEXT: v_mul_lo_u32 v8, v0, v4 +; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v6 +; GISEL-NEXT: v_mul_hi_u32 v6, v0, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; GISEL-NEXT: v_mul_lo_u32 v8, 0, v4 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GISEL-NEXT: v_mul_hi_u32 v7, v0, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v5, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v3 +; GISEL-NEXT: v_mul_hi_u32 v8, 0, v4 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v10, 0 +; GISEL-NEXT: v_mac_f32_e32 v11, 0x4f800000, v9 +; GISEL-NEXT: v_sub_i32_e32 v13, vcc, 0, v3 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v8, v[5:6] +; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v11 +; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], 0, v10, v[6:7] +; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 +; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 +; GISEL-NEXT: v_trunc_f32_e32 v6, v6 +; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6 +; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v5 +; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v6 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v11, 0 +; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], 0, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v12, v[6:7] +; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], 0, v8 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v11, v[9:10] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v7, v12, v5 +; GISEL-NEXT: v_mul_lo_u32 v9, v11, v6 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v10, -1, v4, s[4:5] +; GISEL-NEXT: v_mul_hi_u32 v4, v11, v5 +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v7, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v7, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v8, v16, v10 -; GISEL-NEXT: v_mul_hi_u32 v7, v16, v7 -; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v5, v4 -; GISEL-NEXT: v_mul_hi_u32 v5, v13, v10 -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v7, v12, v6 +; GISEL-NEXT: v_mul_hi_u32 v5, v12, v5 +; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v9, v4 +; GISEL-NEXT: v_mul_hi_u32 v9, v11, v6 ; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v7, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7 -; GISEL-NEXT: v_mul_hi_u32 v8, v16, v10 +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v7, v9 +; GISEL-NEXT: v_mul_hi_u32 v6, v12, v6 ; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v5, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v7, v5 -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v8, v5 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v13, v4 -; GISEL-NEXT: v_addc_u32_e64 v10, s[4:5], v16, v5, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v9, 0 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v17 -; GISEL-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v0, vcc -; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v11, -1, v6, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v10, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v12, v1 -; GISEL-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v7, vcc -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v15, v9, v[5:6] -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v6, v5 +; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v4 +; GISEL-NEXT: v_addc_u32_e64 v12, s[4:5], v12, v5, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v11, 0 +; GISEL-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, v[5:6] +; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v0, v1 +; GISEL-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v11, v[6:7] +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v13, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; GISEL-NEXT: v_mul_lo_u32 v6, v10, v4 -; GISEL-NEXT: v_mul_lo_u32 v8, v9, v7 -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13 -; GISEL-NEXT: v_cndmask_b32_e32 v14, -1, v5, vcc -; GISEL-NEXT: v_mul_hi_u32 v5, v9, v4 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v6, v12, v4 +; GISEL-NEXT: v_mul_lo_u32 v7, v11, v8 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 +; GISEL-NEXT: v_cndmask_b32_e32 v9, -1, v5, vcc +; GISEL-NEXT: v_mul_hi_u32 v5, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v6, v10, v7 -; GISEL-NEXT: v_mul_hi_u32 v4, v10, v4 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; GISEL-NEXT: v_mul_hi_u32 v8, v9, v7 +; GISEL-NEXT: v_mul_lo_u32 v6, v12, v8 +; GISEL-NEXT: v_mul_hi_u32 v4, v12, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; GISEL-NEXT: v_mul_hi_u32 v7, v11, v8 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 -; GISEL-NEXT: v_mul_hi_u32 v7, v10, v7 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v12, v8 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v10, v5, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, 0, v4 -; GISEL-NEXT: v_mul_lo_u32 v7, v2, v6 -; GISEL-NEXT: v_mul_hi_u32 v9, v2, v4 -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v0, v1 -; GISEL-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v13, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; GISEL-NEXT: v_mul_lo_u32 v7, 0, v6 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v4 +; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v12, v5, vcc +; GISEL-NEXT: v_mul_lo_u32 v6, 0, v4 +; GISEL-NEXT: v_mul_lo_u32 v7, v2, v5 +; GISEL-NEXT: v_mul_hi_u32 v11, v2, v4 +; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v13, v1 +; GISEL-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v16, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GISEL-NEXT: v_mul_lo_u32 v7, 0, v5 ; GISEL-NEXT: v_mul_hi_u32 v4, 0, v4 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v2, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v2, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v4, v5 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v9, 0 -; GISEL-NEXT: v_mul_hi_u32 v7, 0, v6 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc -; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v7, v[0:1] -; GISEL-NEXT: v_cndmask_b32_e32 v8, v13, v8, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v12, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v1, v17, v8, vcc -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], 0, v9, v[5:6] +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v11 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v4, v6 +; GISEL-NEXT: v_mul_hi_u32 v12, 0, v5 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v11, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; GISEL-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v12, v[5:6] +; GISEL-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v15, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], 0, v11, v[6:7] ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v7 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v7, vcc +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v8 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v8, vcc ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v2, v3 ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll index 23ef596c021c2..c50b491bcb074 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll @@ -129,11 +129,11 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GFX8-NEXT: v_trunc_f32_e32 v2, v1 -; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 -; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX8-NEXT: v_trunc_f32_e32 v1, v1 +; GFX8-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1 +; GFX8-NEXT: v_add_f32_e32 v0, v2, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v2 +; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v1 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 ; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] ; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 @@ -203,11 +203,11 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v0, v2 -; GFX8-NEXT: v_mul_hi_u32 v4, s9, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v0 +; GFX8-NEXT: v_mul_hi_u32 v3, s9, v1 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v6, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v4, v2 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v3, v2 ; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s10, v7, v[1:2] ; GFX8-NEXT: v_mov_b32_e32 v1, s11 ; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s11, v6, v[2:3] @@ -268,11 +268,11 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GFX9-NEXT: v_trunc_f32_e32 v2, v1 -; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 -; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX9-NEXT: v_trunc_f32_e32 v1, v1 +; GFX9-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1 +; GFX9-NEXT: v_add_f32_e32 v0, v2, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v2 +; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v1 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] ; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 @@ -468,31 +468,31 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v0, s0, v5, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 +; GFX10-NEXT: v_mul_hi_u32 v5, s17, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v2, v6, v2 ; GFX10-NEXT: v_add_co_u32 v0, s0, v0, v3 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v5, s0, v0, v2 -; GFX10-NEXT: v_mul_hi_u32 v2, s17, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 -; GFX10-NEXT: v_add_nc_u32_e32 v3, v4, v3 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s18, v5, 0 -; GFX10-NEXT: v_add3_u32 v3, v3, v6, v2 +; GFX10-NEXT: v_add_co_u32 v6, s0, v0, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v2, v4, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s18, v6, 0 +; GFX10-NEXT: v_add3_u32 v3, v2, v3, v5 ; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, s18, v3, v[1:2] -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, s19, v5, v[1:2] -; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v5, 1 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, s19, v6, v[1:2] +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v6, 1 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v3, vcc_lo ; GFX10-NEXT: v_sub_co_u32 v7, vcc_lo, s16, v0 -; GFX10-NEXT: v_sub_nc_u32_e32 v6, s17, v1 +; GFX10-NEXT: v_sub_nc_u32_e32 v5, s17, v1 ; GFX10-NEXT: v_sub_co_ci_u32_e64 v8, s0, s17, v1, vcc_lo -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s19, v6, vcc_lo +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s19, v5, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s18, v7 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v6, vcc_lo, v7, s18 +; GFX10-NEXT: v_sub_co_u32 v5, vcc_lo, v7, s18 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v9, s0, 0, v0, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s19, v8 ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s19, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, -1, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s18, v6 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s18, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, -1, s0 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s19, v9 ; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, -1, s0 @@ -503,18 +503,18 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s19, v8 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v10, v1, s0 -; GFX10-NEXT: v_sub_co_u32 v10, s0, v6, s18 +; GFX10-NEXT: v_sub_co_u32 v10, s0, v5, s18 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v0, s0, 0, v0, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v13, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v14, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v6, v9, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v5, v9, v0, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v9, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v2, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v2, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v3, v1, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v4, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v8, v6, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v8, v5, s0 ; GFX10-NEXT: global_store_dwordx2 v9, v[0:1], s[12:13] ; GFX10-NEXT: global_store_dwordx2 v9, v[2:3], s[14:15] ; GFX10-NEXT: s_endpgm @@ -1005,14 +1005,14 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX8-NEXT: v_mov_b32_e32 v10, s13 +; GFX8-NEXT: v_mov_b32_e32 v9, s13 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GFX8-NEXT: v_trunc_f32_e32 v2, v1 -; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 -; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX8-NEXT: v_trunc_f32_e32 v1, v1 +; GFX8-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1 +; GFX8-NEXT: v_add_f32_e32 v0, v2, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v2 +; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v1 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 ; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] ; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 @@ -1042,12 +1042,10 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 ; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] ; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 -; GFX8-NEXT: s_sub_u32 s2, 0, s14 ; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3] ; GFX8-NEXT: v_mul_hi_u32 v3, v6, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0 ; GFX8-NEXT: v_mul_lo_u32 v2, v6, v4 -; GFX8-NEXT: s_subb_u32 s3, 0, s15 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 @@ -1084,112 +1082,113 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v0, v2 -; GFX8-NEXT: v_mul_hi_u32 v4, s9, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v0 +; GFX8-NEXT: v_mul_hi_u32 v3, s9, v1 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s12, v8, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v4, v2 -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s12, v9, v[1:2] +; GFX8-NEXT: v_add_u32_e64 v17, s[2:3], 1, v8 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, v3, v2 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s12, v10, v[1:2] ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, s8, v0 ; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s13, v8, v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v2, s9 +; GFX8-NEXT: v_cvt_f32_u32_e32 v5, s14 ; GFX8-NEXT: v_subb_u32_e64 v0, s[0:1], v2, v4, vcc ; GFX8-NEXT: v_sub_u32_e64 v2, s[0:1], s9, v4 +; GFX8-NEXT: v_cvt_f32_u32_e32 v4, s15 ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v1 -; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] +; GFX8-NEXT: v_mul_f32_e32 v4, 0x4f800000, v4 +; GFX8-NEXT: v_add_f32_e32 v4, v4, v5 +; GFX8-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v11, v3, v4, s[0:1] -; GFX8-NEXT: v_cvt_f32_u32_e32 v3, s15 -; GFX8-NEXT: v_cvt_f32_u32_e32 v4, s14 -; GFX8-NEXT: v_subb_u32_e32 v12, vcc, v2, v10, vcc -; GFX8-NEXT: v_mul_f32_e32 v2, 0x4f800000, v3 -; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX8-NEXT: v_subrev_u32_e32 v13, vcc, s12, v1 -; GFX8-NEXT: v_subbrev_u32_e64 v14, s[0:1], 0, v12, vcc -; GFX8-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 +; GFX8-NEXT: v_subb_u32_e32 v12, vcc, v2, v9, vcc +; GFX8-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v11, v3, v6, s[0:1] ; GFX8-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 -; GFX8-NEXT: v_trunc_f32_e32 v4, v3 -; GFX8-NEXT: v_mul_f32_e32 v3, 0xcf800000, v4 -; GFX8-NEXT: v_add_f32_e32 v2, v3, v2 -; GFX8-NEXT: v_cvt_u32_f32_e32 v15, v2 -; GFX8-NEXT: v_add_u32_e64 v16, s[0:1], 1, v8 -; GFX8-NEXT: v_addc_u32_e64 v17, s[0:1], 0, v9, s[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v15, 0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v18, v4 -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v14 -; GFX8-NEXT: v_cndmask_b32_e64 v19, 0, -1, s[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v18, v[3:4] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v13 +; GFX8-NEXT: v_trunc_f32_e32 v3, v3 +; GFX8-NEXT: v_mul_f32_e32 v4, 0xcf800000, v3 +; GFX8-NEXT: v_add_f32_e32 v2, v4, v2 +; GFX8-NEXT: v_cvt_u32_f32_e32 v13, v2 +; GFX8-NEXT: s_sub_u32 s8, 0, s14 +; GFX8-NEXT: v_cvt_u32_f32_e32 v14, v3 +; GFX8-NEXT: v_subrev_u32_e32 v15, vcc, s12, v1 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v13, 0 +; GFX8-NEXT: v_subbrev_u32_e64 v16, s[0:1], 0, v12, vcc +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s8, v14, v[3:4] +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v16 +; GFX8-NEXT: s_subb_u32 s9, 0, s15 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s3, v15, v[4:5] -; GFX8-NEXT: v_mul_lo_u32 v4, v18, v2 -; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v12, v10, vcc -; GFX8-NEXT: v_mul_lo_u32 v5, v15, v6 -; GFX8-NEXT: v_mul_hi_u32 v10, v15, v2 -; GFX8-NEXT: v_mul_hi_u32 v2, v18, v2 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v14 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v10 -; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v10, v18, v6 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4 -; GFX8-NEXT: v_mul_hi_u32 v5, v15, v6 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v10, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s9, v13, v[4:5] +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v15 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] +; GFX8-NEXT: v_mul_lo_u32 v5, v14, v2 +; GFX8-NEXT: v_mul_lo_u32 v7, v13, v6 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v16 +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] +; GFX8-NEXT: v_mul_hi_u32 v4, v13, v2 +; GFX8-NEXT: v_add_u32_e64 v5, s[0:1], v5, v7 +; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[0:1] +; GFX8-NEXT: v_add_u32_e64 v4, s[0:1], v5, v4 +; GFX8-NEXT: v_subb_u32_e32 v4, vcc, v12, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1] +; GFX8-NEXT: v_mul_hi_u32 v2, v14, v2 +; GFX8-NEXT: v_mul_lo_u32 v9, v14, v6 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5 +; GFX8-NEXT: v_mul_hi_u32 v7, v13, v6 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v9, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v7 +; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v9, v7 +; GFX8-NEXT: v_addc_u32_e64 v18, s[2:3], 0, v10, s[2:3] +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 1, v17 +; GFX8-NEXT: v_addc_u32_e32 v12, vcc, 0, v18, vcc +; GFX8-NEXT: v_subrev_u32_e32 v19, vcc, s12, v15 +; GFX8-NEXT: v_mul_hi_u32 v6, v14, v6 +; GFX8-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v10, v5 -; GFX8-NEXT: v_add_u32_e32 v10, vcc, 1, v16 -; GFX8-NEXT: v_addc_u32_e32 v12, vcc, 0, v17, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[0:1] -; GFX8-NEXT: v_subrev_u32_e32 v19, vcc, s12, v13 -; GFX8-NEXT: v_mul_hi_u32 v6, v18, v6 -; GFX8-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v7, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v6, v4 -; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v2 -; GFX8-NEXT: v_addc_u32_e32 v18, vcc, v18, v4, vcc -; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v15, 0 +; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v2 +; GFX8-NEXT: v_addc_u32_e32 v14, vcc, v14, v4, vcc +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s8, v13, 0 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v16, v10, vcc -; GFX8-NEXT: v_mov_b32_e32 v2, v5 -; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s2, v18, v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v2, v17, v9, vcc +; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s8, v14, v[5:6] ; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v11 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v8, v3, s[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[2:3], s3, v15, v[5:6] -; GFX8-NEXT: v_cndmask_b32_e32 v10, v17, v12, vcc -; GFX8-NEXT: v_mul_lo_u32 v6, v18, v4 -; GFX8-NEXT: v_mul_lo_u32 v8, v15, v7 -; GFX8-NEXT: v_cndmask_b32_e64 v3, v9, v10, s[0:1] -; GFX8-NEXT: v_mul_hi_u32 v9, v15, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v13, v19, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v10, v14, v20, vcc -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[8:9], s[2:3], s9, v13, v[6:7] +; GFX8-NEXT: v_mul_lo_u32 v6, v14, v4 +; GFX8-NEXT: v_mul_hi_u32 v9, v13, v4 +; GFX8-NEXT: v_mul_lo_u32 v7, v13, v8 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v18, v12, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v10, v3, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v15, v19, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v10, v16, v20, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v7 +; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v9, v18, v7 -; GFX8-NEXT: v_mul_hi_u32 v4, v18, v4 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6 -; GFX8-NEXT: v_mul_hi_u32 v8, v15, v7 +; GFX8-NEXT: v_mul_lo_u32 v9, v14, v8 +; GFX8-NEXT: v_mul_hi_u32 v4, v14, v4 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 +; GFX8-NEXT: v_mul_hi_u32 v7, v13, v8 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v9, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v9, v8 -; GFX8-NEXT: v_mul_hi_u32 v7, v18, v7 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v7 +; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v9, v7 +; GFX8-NEXT: v_mul_hi_u32 v8, v14, v8 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v6 ; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v15, v4 -; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v18, v6, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v13, v4 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v14, v6, vcc ; GFX8-NEXT: v_mul_lo_u32 v8, s11, v4 ; GFX8-NEXT: v_mul_lo_u32 v9, s10, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v6, v1, v5, s[0:1] @@ -1206,15 +1205,14 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v5, v8 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v8 ; GFX8-NEXT: v_add_u32_e32 v11, vcc, v4, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX8-NEXT: v_mul_hi_u32 v7, s11, v7 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v5, v1 ; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s14, v11, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v8, v1 ; GFX8-NEXT: v_add_u32_e32 v12, vcc, v7, v1 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: v_mad_u64_u32 v[8:9], s[2:3], s14, v12, v[1:2] +; GFX8-NEXT: v_mad_u64_u32 v[8:9], s[2:3], s14, v12, v[5:6] ; GFX8-NEXT: v_cndmask_b32_e64 v7, v0, v10, s[0:1] ; GFX8-NEXT: v_mov_b32_e32 v5, s15 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s15, v11, v[8:9] @@ -1274,13 +1272,16 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: v_mov_b32_e32 v9, s5 +; GFX9-NEXT: s_sub_u32 s8, 0, s6 +; GFX9-NEXT: s_subb_u32 s9, 0, s7 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GFX9-NEXT: v_trunc_f32_e32 v2, v1 -; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 -; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX9-NEXT: v_trunc_f32_e32 v1, v1 +; GFX9-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1 +; GFX9-NEXT: v_add_f32_e32 v0, v2, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v2 +; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v1 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] ; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 @@ -1307,15 +1308,12 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v1, vcc ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 -; GFX9-NEXT: v_mov_b32_e32 v8, s5 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] ; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 -; GFX9-NEXT: s_sub_u32 s2, 0, s6 ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3] ; GFX9-NEXT: v_mul_hi_u32 v3, v6, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0 ; GFX9-NEXT: v_mul_lo_u32 v2, v6, v4 -; GFX9-NEXT: s_subb_u32 s3, 0, s7 ; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 @@ -1350,134 +1348,132 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s4, v9, 0 +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v0, v2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s4, v8, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 ; GFX9-NEXT: v_add3_u32 v10, v3, v2, v5 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s4, v10, v[1:2] ; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s16, v0 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v9, v[2:3] +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v8, v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, s17 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s6 ; GFX9-NEXT: v_subb_co_u32_e64 v0, s[0:1], v2, v4, vcc +; GFX9-NEXT: v_sub_u32_e32 v2, s17, v4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s7 ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v1 -; GFX9-NEXT: v_sub_u32_e32 v2, s17, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] +; GFX9-NEXT: v_mul_f32_e32 v4, 0x4f800000, v4 +; GFX9-NEXT: v_add_f32_e32 v4, v4, v5 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v11, v3, v4, s[0:1] -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s7 -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s6 -; GFX9-NEXT: v_subb_co_u32_e32 v12, vcc, v2, v8, vcc -; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f800000, v3 -; GFX9-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX9-NEXT: v_subrev_co_u32_e32 v13, vcc, s4, v1 -; GFX9-NEXT: v_subbrev_co_u32_e64 v14, s[0:1], 0, v12, vcc -; GFX9-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v12, vcc, v2, v9, vcc +; GFX9-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v11, v3, v6, s[0:1] ; GFX9-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 -; GFX9-NEXT: v_trunc_f32_e32 v4, v3 -; GFX9-NEXT: v_mul_f32_e32 v3, 0xcf800000, v4 -; GFX9-NEXT: v_add_f32_e32 v2, v3, v2 -; GFX9-NEXT: v_cvt_u32_f32_e32 v15, v2 -; GFX9-NEXT: v_add_co_u32_e64 v16, s[0:1], 1, v9 -; GFX9-NEXT: v_addc_co_u32_e64 v17, s[0:1], 0, v10, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v15, 0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v18, v4 -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v14 -; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, -1, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v18, v[3:4] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v13 +; GFX9-NEXT: v_trunc_f32_e32 v3, v3 +; GFX9-NEXT: v_mul_f32_e32 v4, 0xcf800000, v3 +; GFX9-NEXT: v_add_f32_e32 v2, v4, v2 +; GFX9-NEXT: v_cvt_u32_f32_e32 v13, v2 +; GFX9-NEXT: v_cvt_u32_f32_e32 v14, v3 +; GFX9-NEXT: v_subrev_co_u32_e32 v15, vcc, s4, v1 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v13, 0 +; GFX9-NEXT: v_subbrev_co_u32_e64 v16, s[0:1], 0, v12, vcc +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s8, v14, v[3:4] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v16 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s3, v15, v[4:5] -; GFX9-NEXT: v_mul_lo_u32 v4, v18, v2 -; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v12, v8, vcc -; GFX9-NEXT: v_mul_lo_u32 v5, v15, v6 -; GFX9-NEXT: v_mul_hi_u32 v8, v15, v2 -; GFX9-NEXT: v_mul_hi_u32 v2, v18, v2 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v14 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v8, v18, v6 -; GFX9-NEXT: v_add_u32_e32 v4, v5, v4 -; GFX9-NEXT: v_mul_hi_u32 v5, v15, v6 -; GFX9-NEXT: v_mul_hi_u32 v6, v18, v6 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s9, v13, v[4:5] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v15 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] +; GFX9-NEXT: v_mul_lo_u32 v5, v14, v2 +; GFX9-NEXT: v_mul_lo_u32 v7, v13, v6 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v16 +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] +; GFX9-NEXT: v_mul_hi_u32 v4, v13, v2 +; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], v5, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v4, s[0:1], v5, v4 +; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v12, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1] +; GFX9-NEXT: v_mul_hi_u32 v2, v14, v2 +; GFX9-NEXT: v_mul_lo_u32 v9, v14, v6 +; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 +; GFX9-NEXT: v_mul_hi_u32 v7, v13, v6 +; GFX9-NEXT: v_add_co_u32_e64 v17, s[2:3], 1, v8 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v9, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX9-NEXT: v_addc_co_u32_e64 v18, s[2:3], 0, v10, s[2:3] +; GFX9-NEXT: v_add_u32_e32 v7, v9, v7 +; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, 1, v17 +; GFX9-NEXT: v_addc_co_u32_e32 v12, vcc, 0, v18, vcc +; GFX9-NEXT: v_mul_hi_u32 v6, v14, v6 +; GFX9-NEXT: v_subrev_co_u32_e32 v19, vcc, s4, v15 +; GFX9-NEXT: v_subbrev_co_u32_e32 v20, vcc, 0, v4, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_u32_e32 v5, v8, v5 -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 1, v16 -; GFX9-NEXT: v_addc_co_u32_e32 v12, vcc, 0, v17, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v4, v5, v4, v6 -; GFX9-NEXT: v_add_co_u32_e32 v15, vcc, v15, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[0:1] -; GFX9-NEXT: v_addc_co_u32_e32 v18, vcc, v18, v4, vcc -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v15, 0 -; GFX9-NEXT: v_subrev_co_u32_e32 v19, vcc, s4, v13 -; GFX9-NEXT: v_mov_b32_e32 v2, v5 -; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s2, v18, v[2:3] -; GFX9-NEXT: v_subbrev_co_u32_e32 v20, vcc, 0, v7, vcc +; GFX9-NEXT: v_add3_u32 v4, v7, v4, v6 +; GFX9-NEXT: v_add_co_u32_e32 v13, vcc, v13, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v14, vcc, v14, v4, vcc +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s8, v13, 0 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v16, v8, vcc -; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s3, v15, v[5:6] +; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v9, vcc +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s8, v14, v[5:6] ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v11 -; GFX9-NEXT: v_mul_lo_u32 v5, v18, v4 -; GFX9-NEXT: v_mul_lo_u32 v6, v15, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v9, v2, s[0:1] -; GFX9-NEXT: v_mul_hi_u32 v9, v15, v4 -; GFX9-NEXT: v_mul_hi_u32 v4, v18, v4 -; GFX9-NEXT: v_add_co_u32_e64 v5, s[2:3], v5, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[2:3] -; GFX9-NEXT: v_add_co_u32_e64 v5, s[2:3], v5, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[2:3] -; GFX9-NEXT: v_mul_lo_u32 v9, v18, v7 -; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 -; GFX9-NEXT: v_mul_hi_u32 v6, v15, v7 -; GFX9-NEXT: v_mul_hi_u32 v7, v18, v7 -; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v9, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[2:3] -; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v4, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[2:3] -; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v4, v5 -; GFX9-NEXT: v_add_u32_e32 v6, v9, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[2:3] -; GFX9-NEXT: v_add3_u32 v5, v6, v5, v7 -; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v15, v4 -; GFX9-NEXT: v_addc_co_u32_e64 v5, s[2:3], v18, v5, s[2:3] -; GFX9-NEXT: v_mul_lo_u32 v6, s19, v4 -; GFX9-NEXT: v_mul_lo_u32 v7, s18, v5 -; GFX9-NEXT: v_mul_hi_u32 v9, s18, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v17, v12, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[2:3], s9, v13, v[6:7] +; GFX9-NEXT: v_mul_lo_u32 v6, v14, v4 +; GFX9-NEXT: v_mul_hi_u32 v9, v13, v4 +; GFX9-NEXT: v_mul_lo_u32 v7, v13, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v18, v12, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v3, v10, v3, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v8, v13, v19, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v10, v14, v20, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v15, v19, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v10, v16, v20, vcc ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v9, s19, v5 -; GFX9-NEXT: v_mul_hi_u32 v4, s19, v4 +; GFX9-NEXT: v_mul_lo_u32 v9, v14, v8 +; GFX9-NEXT: v_mul_hi_u32 v4, v14, v4 ; GFX9-NEXT: v_add_u32_e32 v6, v7, v6 -; GFX9-NEXT: v_mul_hi_u32 v7, s18, v5 -; GFX9-NEXT: v_mul_hi_u32 v13, s19, v5 +; GFX9-NEXT: v_mul_hi_u32 v7, v13, v8 +; GFX9-NEXT: v_mul_hi_u32 v8, v14, v8 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v9, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v4, v6 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 +; GFX9-NEXT: v_add_u32_e32 v7, v9, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v6, v7, v6, v8 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v13, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v14, v6, vcc +; GFX9-NEXT: v_mul_lo_u32 v8, s19, v4 +; GFX9-NEXT: v_mul_lo_u32 v9, s18, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v6, v1, v5, s[0:1] +; GFX9-NEXT: v_mul_hi_u32 v1, s18, v4 +; GFX9-NEXT: v_mul_hi_u32 v4, s19, v4 +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v8, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v5, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v5, s19, v7 +; GFX9-NEXT: v_add_u32_e32 v1, v8, v1 +; GFX9-NEXT: v_mul_hi_u32 v8, s18, v7 +; GFX9-NEXT: v_mul_hi_u32 v7, s19, v7 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v4, v1 ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s6, v11, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v6, v1, v8, s[0:1] -; GFX9-NEXT: v_add_u32_e32 v1, v9, v7 -; GFX9-NEXT: v_add3_u32 v12, v1, v12, v13 -; GFX9-NEXT: v_mov_b32_e32 v1, v5 -; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[2:3], s6, v12, v[1:2] +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add_u32_e32 v8, v9, v8 +; GFX9-NEXT: v_add3_u32 v12, v8, v1, v7 +; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[2:3], s6, v12, v[5:6] ; GFX9-NEXT: v_cndmask_b32_e64 v7, v0, v10, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v5, s7 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s7, v11, v[8:9] @@ -1546,14 +1542,14 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 ; GFX10-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0 ; GFX10-NEXT: v_mul_f32_e32 v3, 0x2f800000, v1 -; GFX10-NEXT: v_trunc_f32_e32 v4, v2 -; GFX10-NEXT: v_trunc_f32_e32 v5, v3 -; GFX10-NEXT: v_mul_f32_e32 v2, 0xcf800000, v4 -; GFX10-NEXT: v_mul_f32_e32 v3, 0xcf800000, v5 -; GFX10-NEXT: v_cvt_u32_f32_e32 v9, v4 -; GFX10-NEXT: v_cvt_u32_f32_e32 v10, v5 -; GFX10-NEXT: v_add_f32_e32 v0, v2, v0 -; GFX10-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX10-NEXT: v_trunc_f32_e32 v2, v2 +; GFX10-NEXT: v_trunc_f32_e32 v4, v3 +; GFX10-NEXT: v_mul_f32_e32 v3, 0xcf800000, v2 +; GFX10-NEXT: v_mul_f32_e32 v5, 0xcf800000, v4 +; GFX10-NEXT: v_cvt_u32_f32_e32 v9, v2 +; GFX10-NEXT: v_cvt_u32_f32_e32 v10, v4 +; GFX10-NEXT: v_add_f32_e32 v0, v3, v0 +; GFX10-NEXT: v_add_f32_e32 v1, v5, v1 ; GFX10-NEXT: v_cvt_u32_f32_e32 v7, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v8, v1 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s1, v7, 0 @@ -1662,119 +1658,119 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_mul_hi_u32 v4, s17, v4 ; GFX10-NEXT: v_mul_lo_u32 v9, s17, v2 ; GFX10-NEXT: v_mul_lo_u32 v6, s19, v1 -; GFX10-NEXT: v_mul_hi_u32 v10, s16, v2 -; GFX10-NEXT: v_mul_hi_u32 v11, s17, v2 -; GFX10-NEXT: v_mul_lo_u32 v2, s18, v0 +; GFX10-NEXT: v_mul_lo_u32 v11, s18, v0 ; GFX10-NEXT: v_mul_hi_u32 v7, s18, v1 ; GFX10-NEXT: v_mul_hi_u32 v1, s19, v1 ; GFX10-NEXT: v_mul_lo_u32 v12, s19, v0 -; GFX10-NEXT: v_mul_hi_u32 v13, s18, v0 -; GFX10-NEXT: v_mul_hi_u32 v14, s19, v0 -; GFX10-NEXT: v_add_co_u32 v0, s0, v3, v8 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v4, s0, v9, v4 +; GFX10-NEXT: v_add_co_u32 v3, s0, v3, v8 +; GFX10-NEXT: v_mul_hi_u32 v10, s16, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v2, s0, v6, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v1, s0, v12, v1 +; GFX10-NEXT: v_add_co_u32 v4, s0, v9, v4 ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v0, s0, v0, v5 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX10-NEXT: v_add_co_u32 v6, s0, v6, v11 +; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s0 +; GFX10-NEXT: v_add_co_u32 v1, s0, v12, v1 +; GFX10-NEXT: v_mul_hi_u32 v13, s18, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s0 +; GFX10-NEXT: v_add_co_u32 v3, s0, v3, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v4, s0, v4, v10 ; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v2, s0, v2, v7 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v3, v0 +; GFX10-NEXT: v_add_co_u32 v6, s0, v6, v7 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v1, s0, v1, v13 ; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s0 -; GFX10-NEXT: v_add_nc_u32_e32 v2, v6, v2 -; GFX10-NEXT: v_add_nc_u32_e32 v5, v8, v5 -; GFX10-NEXT: v_add_co_u32 v8, s0, v4, v0 +; GFX10-NEXT: v_mul_hi_u32 v10, s19, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v0, v8, v3 +; GFX10-NEXT: v_add_nc_u32_e32 v3, v9, v5 +; GFX10-NEXT: v_add_nc_u32_e32 v5, v11, v6 +; GFX10-NEXT: v_mul_hi_u32 v2, s17, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v6, v12, v7 +; GFX10-NEXT: v_add_co_u32 v7, s0, v4, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v10, s0, v1, v2 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s1, s4, v8, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, s6, v10, 0 -; GFX10-NEXT: v_add_nc_u32_e32 v7, v9, v7 -; GFX10-NEXT: v_add3_u32 v9, v5, v4, v11 -; GFX10-NEXT: v_add_co_u32 v12, vcc_lo, v8, 1 -; GFX10-NEXT: v_mov_b32_e32 v11, 0 -; GFX10-NEXT: v_add3_u32 v7, v7, v6, v14 +; GFX10-NEXT: v_add_co_u32 v8, s0, v1, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s4, v7, 0 +; GFX10-NEXT: v_add3_u32 v9, v3, v4, v2 +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, s6, v8, 0 +; GFX10-NEXT: v_add3_u32 v10, v6, v5, v10 +; GFX10-NEXT: v_mov_b32_e32 v12, 0 ; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, s4, v9, v[1:2] -; GFX10-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, 0, v9, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, s6, v7, v[3:4] -; GFX10-NEXT: v_mad_u64_u32 v[3:4], s0, s5, v8, v[4:5] -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v12, 1 -; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v13, vcc_lo +; GFX10-NEXT: v_add_co_u32 v1, vcc_lo, v7, 1 +; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, 0, v9, vcc_lo +; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, s6, v10, v[3:4] +; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v1, 1 +; GFX10-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, 0, v11, vcc_lo ; GFX10-NEXT: v_sub_co_u32 v14, vcc_lo, s16, v0 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s7, v10, v[5:6] -; GFX10-NEXT: v_sub_co_ci_u32_e64 v5, s0, s17, v3, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s4, v14 -; GFX10-NEXT: v_sub_nc_u32_e32 v1, s17, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, -1, s0 -; GFX10-NEXT: v_sub_co_u32 v15, s0, s18, v2 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s5, v1, vcc_lo -; GFX10-NEXT: v_sub_co_ci_u32_e64 v16, s1, s19, v0, s0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v15 -; GFX10-NEXT: v_sub_nc_u32_e32 v0, s19, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v17, vcc_lo, v14, s4 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v18, s1, 0, v1, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e64 s1, s5, v5 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v23, s0, s7, v0, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s5, v18 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s5, v1, vcc_lo +; GFX10-NEXT: v_mad_u64_u32 v[3:4], s0, s5, v7, v[4:5] +; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, s7, v8, v[5:6] +; GFX10-NEXT: v_sub_nc_u32_e32 v5, s17, v3 +; GFX10-NEXT: v_sub_co_ci_u32_e64 v3, s0, s17, v3, vcc_lo +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s5, v5, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v14 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v15, vcc_lo, v14, s4 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v16, s0, 0, v0, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v17, s0, s18, v2 +; GFX10-NEXT: v_sub_co_ci_u32_e64 v18, s1, s19, v4, s0 +; GFX10-NEXT: v_cmp_le_u32_e64 s1, s4, v15 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s5, v0, vcc_lo +; GFX10-NEXT: v_sub_nc_u32_e32 v4, s19, v4 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, -1, s1 +; GFX10-NEXT: v_cmp_le_u32_e64 s1, s5, v16 ; GFX10-NEXT: v_cndmask_b32_e64 v19, 0, -1, s1 -; GFX10-NEXT: v_cmp_le_u32_e64 s1, s4, v17 +; GFX10-NEXT: v_cmp_le_u32_e64 s1, s5, v3 ; GFX10-NEXT: v_cndmask_b32_e64 v20, 0, -1, s1 -; GFX10-NEXT: v_cmp_le_u32_e64 s1, s5, v18 -; GFX10-NEXT: v_cndmask_b32_e64 v21, 0, -1, s1 -; GFX10-NEXT: v_cmp_le_u32_e64 s1, s7, v16 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v21, v20, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s5, v5 -; GFX10-NEXT: v_cndmask_b32_e64 v22, 0, -1, s1 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v19, v3, s0 -; GFX10-NEXT: v_sub_co_u32 v0, s0, v17, s4 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v19, s0, 0, v1, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v12, v4, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v13, v6, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v6, s1, v15, s6 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v17, v0, vcc_lo -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v12, s2, 0, v23, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v8, v1, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v9, v3, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v18, v19, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s7, v16 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v14, v4, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v3, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v22, v2, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v12 -; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v6 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s5, v16 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v19, v2, s1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s5, v3 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v20, v5, s1 +; GFX10-NEXT: v_sub_co_u32 v2, s1, v15, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e64 s2, 0, v5 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v6, s1, 0, v0, s1 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e64 s1, s7, v18 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v1, s2 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v7, s0, s7, v4, s0 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s6, v17 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v9, v5, s2 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v15, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, -1, s1 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v16, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, -1, s0 +; GFX10-NEXT: v_sub_co_u32 v11, s0, v17, s6 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v13, s1, 0, v7, s0 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s7, v18 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v14, v2, s2 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v9, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v13 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v11 ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc_lo -; GFX10-NEXT: v_add_co_u32 v13, vcc_lo, v10, 1 -; GFX10-NEXT: v_add_co_ci_u32_e32 v14, vcc_lo, 0, v7, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s7, v12 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc_lo -; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v13, 1 -; GFX10-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, 0, v14, vcc_lo -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v18, vcc_lo, s7, v23, s1 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 -; GFX10-NEXT: v_sub_co_u32 v8, s1, v6, s6 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v18, s1, 0, v18, s1 -; GFX10-NEXT: v_cndmask_b32_e32 v9, v13, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v13, v14, v17, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 s1, 0, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v8, v12, v18, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v10, v9, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v13, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v15, v6, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v16, v8, s1 -; GFX10-NEXT: global_store_dwordx4 v11, v[0:3], s[12:13] -; GFX10-NEXT: global_store_dwordx4 v11, v[4:7], s[14:15] +; GFX10-NEXT: v_add_co_u32 v14, vcc_lo, v8, 1 +; GFX10-NEXT: v_add_co_ci_u32_e32 v15, vcc_lo, 0, v10, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s7, v13 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo +; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v14, 1 +; GFX10-NEXT: v_add_co_ci_u32_e32 v16, vcc_lo, 0, v15, vcc_lo +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v7, vcc_lo, s7, v7, s0 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v5 +; GFX10-NEXT: v_sub_co_u32 v5, s0, v11, s6 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v7, s0, 0, v7, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v9, v14, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v11, v11, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v7, v13, v7, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v5, v3, v6, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v8, v9, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v10, v14, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v17, v11, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v18, v7, s0 +; GFX10-NEXT: global_store_dwordx4 v12, v[0:3], s[12:13] +; GFX10-NEXT: global_store_dwordx4 v12, v[4:7], s[14:15] ; GFX10-NEXT: s_endpgm %div = udiv <2 x i64> %x, %y store <2 x i64> %div, ptr addrspace(1) %out0 diff --git a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll index 003aa049b2d1b..ae83766cd6a4a 100644 --- a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll +++ b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll @@ -9181,12 +9181,12 @@ define void @flat_atomic_fsub_f32_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB117_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -9209,12 +9209,12 @@ define void @flat_atomic_fsub_f32_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB117_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -9230,20 +9230,20 @@ define void @flat_atomic_fsub_f32_ret_av_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_fsub_f32_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ; def v4 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB118_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: v_sub_f32_e32 v2, v3, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB118_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9256,20 +9256,20 @@ define void @flat_atomic_fsub_f32_ret_av_av(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_fsub_f32_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX950-NEXT: s_mov_b64 s[0:1], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v4 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB118_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_sub_f32_e32 v4, v5, v3 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: v_sub_f32_e32 v2, v3, v4 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB118_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9304,13 +9304,13 @@ define void @flat_atomic_fmax_f32_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB119_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -9334,13 +9334,13 @@ define void @flat_atomic_fmax_f32_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB119_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -9356,22 +9356,22 @@ define void @flat_atomic_fmax_f32_ret_av_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_fmax_f32_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ; def v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB120_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_max_f32_e32 v2, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB120_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9384,22 +9384,22 @@ define void @flat_atomic_fmax_f32_ret_av_av(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_fmax_f32_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v2 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[0:1], 0 -; GFX950-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX950-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX950-NEXT: .LBB120_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_max_f32_e32 v2, v5, v5 -; GFX950-NEXT: v_max_f32_e32 v4, v2, v3 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX950-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB120_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9434,13 +9434,13 @@ define void @flat_atomic_fmin_f32_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB121_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -9464,13 +9464,13 @@ define void @flat_atomic_fmin_f32_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB121_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -9486,22 +9486,22 @@ define void @flat_atomic_fmin_f32_ret_av_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_fmin_f32_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ; def v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB122_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_max_f32_e32 v2, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v2, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB122_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9514,22 +9514,22 @@ define void @flat_atomic_fmin_f32_ret_av_av(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_fmin_f32_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v2 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[0:1], 0 -; GFX950-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX950-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX950-NEXT: .LBB122_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_max_f32_e32 v2, v5, v5 -; GFX950-NEXT: v_min_f32_e32 v4, v2, v3 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX950-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB122_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9565,13 +9565,13 @@ define void @flat_atomic_fmaximum_f32_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB123_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -9594,12 +9594,12 @@ define void @flat_atomic_fmaximum_f32_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB123_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -9615,49 +9615,49 @@ define void @flat_atomic_fmaximum_f32_ret_av_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_fmaximum_f32_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ; def v4 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB124_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v2 -; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v5, v2 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc -; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v4 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v3, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB124_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v4 +; GFX90A-NEXT: ; use v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_fmaximum_f32_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX950-NEXT: s_mov_b64 s[0:1], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v4 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB124_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_maximum3_f32 v4, v5, v3, v3 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: v_maximum3_f32 v2, v3, v4, v4 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB124_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9693,13 +9693,13 @@ define void @flat_atomic_fminimum_f32_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB125_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -9722,12 +9722,12 @@ define void @flat_atomic_fminimum_f32_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB125_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -9743,49 +9743,49 @@ define void @flat_atomic_fminimum_f32_ret_av_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_fminimum_f32_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ; def v4 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB126_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: v_min_f32_e32 v4, v5, v2 -; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v5, v2 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc -; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: v_min_f32_e32 v2, v3, v4 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v3, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB126_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v4 +; GFX90A-NEXT: ; use v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_fminimum_f32_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX950-NEXT: s_mov_b64 s[0:1], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v4 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB126_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_minimum3_f32 v4, v5, v3, v3 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: v_minimum3_f32 v2, v3, v4, v4 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB126_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -10009,10 +10009,10 @@ define void @flat_atomic_fadd_f64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_shared_base -; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3] -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[4:5] +; GFX950-NEXT: ; def v[2:3] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -10020,39 +10020,39 @@ define void @flat_atomic_fadd_f64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_cbranch_execz .LBB128_6 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.check.private ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s3, v3 +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s3, v5 ; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX950-NEXT: s_cbranch_execz .LBB128_3 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.global -; GFX950-NEXT: global_atomic_add_f64 v[0:1], v[2:3], v[4:5], off sc0 -; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: global_atomic_add_f64 v[0:1], v[4:5], v[2:3], off sc0 ; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-NEXT: .LBB128_3: ; %Flow ; GFX950-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] ; GFX950-NEXT: s_cbranch_execz .LBB128_5 ; GFX950-NEXT: ; %bb.4: ; %atomicrmw.private -; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc -; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_add_f64 v[2:3], v[0:1], v[4:5] -; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] +; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB128_5: ; %Flow1 ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-NEXT: .LBB128_6: ; %Flow2 ; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB128_8 ; GFX950-NEXT: ; %bb.7: ; %atomicrmw.shared -; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v0, -1, v2, vcc -; GFX950-NEXT: ds_add_rtn_f64 v[0:1], v0, v[4:5] +; GFX950-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc +; GFX950-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] ; GFX950-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-NEXT: .LBB128_8: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] @@ -10095,14 +10095,14 @@ define void @flat_atomic_fsub_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB129_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX90A-NEXT: .LBB129_4: ; %Flow3 @@ -10154,14 +10154,14 @@ define void @flat_atomic_fsub_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB129_2 ; GFX950-NEXT: ; %bb.3: ; %Flow ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX950-NEXT: .LBB129_4: ; %Flow3 @@ -10195,51 +10195,51 @@ define void @flat_atomic_fsub_f64_ret_av_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_fsub_f64_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[6:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB130_4 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: .LBB130_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: v_add_f64 v[6:7], v[8:9], -v[4:5] -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], -v[6:7] +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB130_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX90A-NEXT: .LBB130_4: ; %Flow3 ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB130_6 ; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc -; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] -; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], -v[6:7] +; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB130_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ; use v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -10249,48 +10249,48 @@ define void @flat_atomic_fsub_f64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[2:3] +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v7 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ; def v[4:5] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB130_4 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7] ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: .LBB130_2: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b64_e32 v[6:7], v[4:5] -; GFX950-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] -; GFX950-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] sc0 +; GFX950-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB130_2 ; GFX950-NEXT: ; %bb.3: ; %Flow ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-NEXT: .LBB130_4: ; %Flow3 ; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB130_6 ; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private -; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc -; GFX950-NEXT: scratch_load_dwordx2 v[4:5], v6, off +; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_add_f64 v[0:1], v[4:5], -v[2:3] -; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off +; GFX950-NEXT: v_add_f64 v[2:3], v[0:1], -v[4:5] +; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-NEXT: .LBB130_6: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[4:5] +; GFX950-NEXT: ; use v[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] @@ -10710,14 +10710,14 @@ define void @flat_atomic_fmaximum_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB135_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: .LBB135_4: ; %Flow2 @@ -10778,14 +10778,14 @@ define void @flat_atomic_fmaximum_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB135_2 ; GFX950-NEXT: ; %bb.3: ; %Flow ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-NEXT: .LBB135_4: ; %Flow2 @@ -10823,59 +10823,59 @@ define void @flat_atomic_fmaximum_f64_ret_av_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_fmaximum_f64_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB136_4 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX90A-NEXT: .LBB136_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[10:11], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: v_max_f64 v[2:3], v[10:11], v[0:1] -; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e32 v9, v3, v6, vcc -; GFX90A-NEXT: v_cndmask_b32_e64 v8, v2, 0, vcc -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[8:11] glc +; GFX90A-NEXT: v_max_f64 v[0:1], v[2:3], v[4:5] +; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[10:11] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB136_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: .LBB136_4: ; %Flow2 ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB136_6 ; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc -; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc +; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[0:1] -; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e64 v1, v4, 0, vcc -; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v7, vcc -; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5] +; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc +; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB136_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ; use v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -10885,58 +10885,58 @@ define void @flat_atomic_fmaximum_f64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[2:3] +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v7 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ; def v[4:5] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB136_4 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7] ; GFX950-NEXT: s_mov_b64 s[2:3], 0 -; GFX950-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX950-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX950-NEXT: .LBB136_2: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b64_e32 v[10:11], v[2:3] -; GFX950-NEXT: v_max_f64 v[2:3], v[10:11], v[0:1] -; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[0:1] +; GFX950-NEXT: v_max_f64 v[0:1], v[2:3], v[4:5] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v9, v3, v6, vcc -; GFX950-NEXT: v_cndmask_b32_e64 v8, v2, 0, vcc -; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[8:11] sc0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[10:11] +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB136_2 ; GFX950-NEXT: ; %bb.3: ; %Flow ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: .LBB136_4: ; %Flow2 ; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB136_6 ; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private -; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] ; GFX950-NEXT: v_mov_b32_e32 v7, 0x7ff80000 ; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc -; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v6, off +; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_max_f64 v[4:5], v[2:3], v[0:1] -; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[0:1] +; GFX950-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc -; GFX950-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc -; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off +; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-NEXT: .LBB136_6: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[2:3] +; GFX950-NEXT: ; use v[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] @@ -10978,14 +10978,14 @@ define void @flat_atomic_fminimum_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB137_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: .LBB137_4: ; %Flow2 @@ -11046,14 +11046,14 @@ define void @flat_atomic_fminimum_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB137_2 ; GFX950-NEXT: ; %bb.3: ; %Flow ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-NEXT: .LBB137_4: ; %Flow2 @@ -11091,59 +11091,59 @@ define void @flat_atomic_fminimum_f64_ret_av_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_fminimum_f64_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB138_4 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX90A-NEXT: .LBB138_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[10:11], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: v_min_f64 v[2:3], v[10:11], v[0:1] -; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e32 v9, v3, v6, vcc -; GFX90A-NEXT: v_cndmask_b32_e64 v8, v2, 0, vcc -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[8:11] glc +; GFX90A-NEXT: v_min_f64 v[0:1], v[2:3], v[4:5] +; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[10:11] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB138_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: .LBB138_4: ; %Flow2 ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB138_6 ; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc -; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc +; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_min_f64 v[4:5], v[2:3], v[0:1] -; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e64 v1, v4, 0, vcc -; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v7, vcc -; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5] +; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc +; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB138_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ; use v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -11153,58 +11153,58 @@ define void @flat_atomic_fminimum_f64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[2:3] +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v7 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ; def v[4:5] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB138_4 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7] ; GFX950-NEXT: s_mov_b64 s[2:3], 0 -; GFX950-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX950-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX950-NEXT: .LBB138_2: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b64_e32 v[10:11], v[2:3] -; GFX950-NEXT: v_min_f64 v[2:3], v[10:11], v[0:1] -; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[0:1] +; GFX950-NEXT: v_min_f64 v[0:1], v[2:3], v[4:5] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v9, v3, v6, vcc -; GFX950-NEXT: v_cndmask_b32_e64 v8, v2, 0, vcc -; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[8:11] sc0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[10:11] +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB138_2 ; GFX950-NEXT: ; %bb.3: ; %Flow ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: .LBB138_4: ; %Flow2 ; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB138_6 ; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private -; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] ; GFX950-NEXT: v_mov_b32_e32 v7, 0x7ff80000 ; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc -; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v6, off +; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_min_f64 v[4:5], v[2:3], v[0:1] -; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[0:1] +; GFX950-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc -; GFX950-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc -; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off +; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-NEXT: .LBB138_6: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[2:3] +; GFX950-NEXT: ; use v[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] @@ -17456,26 +17456,27 @@ define void @flat_atomic_fsub_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] ; GFX90A-NEXT: .LBB225_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_sub_f32_e32 v2, v3, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc +; GFX90A-NEXT: v_sub_f32_e32 v0, v1, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB225_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -17485,26 +17486,27 @@ define void @flat_atomic_fsub_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-NEXT: .LBB225_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_sub_f32_e32 v2, v3, v4 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-NEXT: v_sub_f32_e32 v0, v1, v4 +; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB225_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -17521,20 +17523,20 @@ define void @flat_atomic_fsub_f32_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ; def v4 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB226_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: v_sub_f32_e32 v2, v3, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB226_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17548,20 +17550,20 @@ define void @flat_atomic_fsub_f32_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v4 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB226_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_sub_f32_e32 v4, v5, v3 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: v_sub_f32_e32 v2, v3, v4 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB226_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17582,29 +17584,29 @@ define void @flat_atomic_fmax_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v4, v0, v0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] ; GFX90A-NEXT: .LBB227_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX90A-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB227_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -17614,29 +17616,29 @@ define void @flat_atomic_fmax_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_max_f32_e32 v4, v0, v0 -; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-NEXT: .LBB227_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX950-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX950-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB227_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -17653,29 +17655,29 @@ define void @flat_atomic_fmax_f32_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v3, v0, v0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: v_max_f32_e32 v4, v0, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] ; GFX90A-NEXT: .LBB228_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_max_f32_e32 v2, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX90A-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB228_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v2 +; GFX90A-NEXT: ; use v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -17683,29 +17685,29 @@ define void @flat_atomic_fmax_f32_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[2:3], 0 -; GFX950-NEXT: v_max_f32_e32 v3, v0, v0 -; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: v_max_f32_e32 v4, v0, v0 +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-NEXT: .LBB228_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_max_f32_e32 v2, v5, v5 -; GFX950-NEXT: v_max_f32_e32 v4, v2, v3 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX950-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB228_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v2 +; GFX950-NEXT: ; use v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10 @@ -17720,29 +17722,29 @@ define void @flat_atomic_fmin_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v4, v0, v0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] ; GFX90A-NEXT: .LBB229_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX90A-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB229_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -17752,29 +17754,29 @@ define void @flat_atomic_fmin_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_max_f32_e32 v4, v0, v0 -; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-NEXT: .LBB229_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX950-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX950-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB229_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -17791,29 +17793,29 @@ define void @flat_atomic_fmin_f32_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v3, v0, v0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: v_max_f32_e32 v4, v0, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] ; GFX90A-NEXT: .LBB230_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_max_f32_e32 v2, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v2, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX90A-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB230_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v2 +; GFX90A-NEXT: ; use v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -17821,29 +17823,29 @@ define void @flat_atomic_fmin_f32_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[2:3], 0 -; GFX950-NEXT: v_max_f32_e32 v3, v0, v0 -; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: v_max_f32_e32 v4, v0, v0 +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-NEXT: .LBB230_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_max_f32_e32 v2, v5, v5 -; GFX950-NEXT: v_min_f32_e32 v4, v2, v3 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX950-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB230_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v2 +; GFX950-NEXT: ; use v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10 @@ -17858,29 +17860,30 @@ define void @flat_atomic_fmaximum_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] ; GFX90A-NEXT: .LBB231_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v4 -; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v3, v4 -; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v4 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB231_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -17890,26 +17893,27 @@ define void @flat_atomic_fmaximum_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-NEXT: .LBB231_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_maximum3_f32 v2, v3, v4, v4 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-NEXT: v_maximum3_f32 v0, v1, v4, v4 +; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB231_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -17926,29 +17930,30 @@ define void @flat_atomic_fmaximum_f32_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ; def v4 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB232_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v2 -; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v5, v2 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc -; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v4 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB232_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v4 +; GFX90A-NEXT: ; use v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -17956,20 +17961,20 @@ define void @flat_atomic_fmaximum_f32_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v4 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB232_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_maximum3_f32 v4, v5, v3, v3 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: v_maximum3_f32 v2, v3, v4, v4 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB232_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17990,29 +17995,30 @@ define void @flat_atomic_fminimum_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] ; GFX90A-NEXT: .LBB233_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_min_f32_e32 v2, v3, v4 -; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v3, v4 -; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc +; GFX90A-NEXT: v_min_f32_e32 v0, v1, v4 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB233_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -18022,26 +18028,27 @@ define void @flat_atomic_fminimum_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-NEXT: .LBB233_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_minimum3_f32 v2, v3, v4, v4 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-NEXT: v_minimum3_f32 v0, v1, v4, v4 +; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB233_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -18058,29 +18065,30 @@ define void @flat_atomic_fminimum_f32_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ; def v4 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB234_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: v_min_f32_e32 v4, v5, v2 -; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v5, v2 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc -; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: v_min_f32_e32 v0, v1, v4 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB234_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v4 +; GFX90A-NEXT: ; use v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -18088,20 +18096,20 @@ define void @flat_atomic_fminimum_f32_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v4 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB234_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_minimum3_f32 v4, v5, v3, v3 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: v_minimum3_f32 v2, v3, v4, v4 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB234_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18393,14 +18401,14 @@ define void @flat_atomic_fsub_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB237_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_branch .LBB237_6 ; GFX90A-NEXT: .LBB237_4: ; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 @@ -18451,14 +18459,14 @@ define void @flat_atomic_fsub_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB237_2 ; GFX950-NEXT: ; %bb.3: ; %Flow ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_branch .LBB237_6 ; GFX950-NEXT: .LBB237_4: ; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 @@ -18496,43 +18504,43 @@ define void @flat_atomic_fsub_f64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_cbranch_vccz .LBB238_4 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: .LBB238_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: v_add_f64 v[6:7], v[8:9], -v[0:1] -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[6:9] glc +; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB238_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_branch .LBB238_6 ; GFX90A-NEXT: .LBB238_4: -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_cbranch_execz .LBB238_6 ; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_mov_b32_e32 v6, s4 +; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], -v[0:1] -; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], -v[4:5] +; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB238_6: ; %atomicrmw.phi ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ; use v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -18547,40 +18555,40 @@ define void @flat_atomic_fsub_f64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ; def v[4:5] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_cbranch_vccz .LBB238_4 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: v_mov_b64_e32 v[4:5], s[0:1] -; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX950-NEXT: v_mov_b64_e32 v[6:7], s[0:1] +; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7] ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: .LBB238_2: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b64_e32 v[8:9], v[2:3] -; GFX950-NEXT: v_add_f64 v[6:7], v[8:9], -v[0:1] -; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[6:9] sc0 +; GFX950-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB238_2 ; GFX950-NEXT: ; %bb.3: ; %Flow ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_branch .LBB238_6 ; GFX950-NEXT: .LBB238_4: -; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: s_cbranch_execz .LBB238_6 ; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-NEXT: s_cselect_b32 s0, s0, -1 -; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_add_f64 v[0:1], v[2:3], -v[0:1] -; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_add_f64 v[2:3], v[0:1], -v[4:5] +; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0 ; GFX950-NEXT: .LBB238_6: ; %atomicrmw.phi ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[2:3] +; GFX950-NEXT: ; use v[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] @@ -18988,14 +18996,14 @@ define void @flat_atomic_fmaximum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB243_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_branch .LBB243_6 ; GFX90A-NEXT: .LBB243_4: ; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 @@ -19055,14 +19063,14 @@ define void @flat_atomic_fmaximum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB243_2 ; GFX950-NEXT: ; %bb.3: ; %Flow ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_branch .LBB243_6 ; GFX950-NEXT: .LBB243_4: ; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 @@ -19104,51 +19112,51 @@ define void @flat_atomic_fmaximum_f64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_cbranch_vccz .LBB244_4 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX90A-NEXT: .LBB244_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[10:11], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: v_max_f64 v[2:3], v[10:11], v[0:1] -; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e32 v9, v3, v6, vcc -; GFX90A-NEXT: v_cndmask_b32_e64 v8, v2, 0, vcc -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[8:11] glc +; GFX90A-NEXT: v_max_f64 v[0:1], v[2:3], v[4:5] +; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[10:11] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB244_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_branch .LBB244_6 ; GFX90A-NEXT: .LBB244_4: -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_cbranch_execz .LBB244_6 ; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 ; GFX90A-NEXT: v_mov_b32_e32 v6, s4 -; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[0:1] -; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e64 v1, v4, 0, vcc -; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v7, vcc -; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5] +; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc +; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB244_6: ; %atomicrmw.phi ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ; use v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -19163,50 +19171,50 @@ define void @flat_atomic_fmaximum_f64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ; def v[4:5] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_cbranch_vccz .LBB244_4 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: v_mov_b64_e32 v[4:5], s[0:1] -; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX950-NEXT: v_mov_b64_e32 v[6:7], s[0:1] +; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7] ; GFX950-NEXT: s_mov_b64 s[2:3], 0 -; GFX950-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX950-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX950-NEXT: .LBB244_2: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b64_e32 v[10:11], v[2:3] -; GFX950-NEXT: v_max_f64 v[2:3], v[10:11], v[0:1] -; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[0:1] +; GFX950-NEXT: v_max_f64 v[0:1], v[2:3], v[4:5] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v9, v3, v6, vcc -; GFX950-NEXT: v_cndmask_b32_e64 v8, v2, 0, vcc -; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[8:11] sc0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[10:11] +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB244_2 ; GFX950-NEXT: ; %bb.3: ; %Flow ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_branch .LBB244_6 ; GFX950-NEXT: .LBB244_4: -; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: s_cbranch_execz .LBB244_6 ; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-NEXT: s_cselect_b32 s0, s0, -1 -; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0 ; GFX950-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_max_f64 v[4:5], v[2:3], v[0:1] -; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[0:1] +; GFX950-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc -; GFX950-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc -; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0 ; GFX950-NEXT: .LBB244_6: ; %atomicrmw.phi ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[2:3] +; GFX950-NEXT: ; use v[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] @@ -19248,14 +19256,14 @@ define void @flat_atomic_fminimum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB245_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_branch .LBB245_6 ; GFX90A-NEXT: .LBB245_4: ; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 @@ -19315,14 +19323,14 @@ define void @flat_atomic_fminimum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB245_2 ; GFX950-NEXT: ; %bb.3: ; %Flow ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_branch .LBB245_6 ; GFX950-NEXT: .LBB245_4: ; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 @@ -19364,51 +19372,51 @@ define void @flat_atomic_fminimum_f64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_cbranch_vccz .LBB246_4 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX90A-NEXT: .LBB246_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[10:11], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: v_min_f64 v[2:3], v[10:11], v[0:1] -; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e32 v9, v3, v6, vcc -; GFX90A-NEXT: v_cndmask_b32_e64 v8, v2, 0, vcc -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[8:11] glc +; GFX90A-NEXT: v_min_f64 v[0:1], v[2:3], v[4:5] +; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[10:11] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB246_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_branch .LBB246_6 ; GFX90A-NEXT: .LBB246_4: -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_cbranch_execz .LBB246_6 ; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 ; GFX90A-NEXT: v_mov_b32_e32 v6, s4 -; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_min_f64 v[4:5], v[2:3], v[0:1] -; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e64 v1, v4, 0, vcc -; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v7, vcc -; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5] +; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc +; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB246_6: ; %atomicrmw.phi ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ; use v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -19423,50 +19431,50 @@ define void @flat_atomic_fminimum_f64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ; def v[4:5] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_cbranch_vccz .LBB246_4 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: v_mov_b64_e32 v[4:5], s[0:1] -; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX950-NEXT: v_mov_b64_e32 v[6:7], s[0:1] +; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7] ; GFX950-NEXT: s_mov_b64 s[2:3], 0 -; GFX950-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX950-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX950-NEXT: .LBB246_2: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b64_e32 v[10:11], v[2:3] -; GFX950-NEXT: v_min_f64 v[2:3], v[10:11], v[0:1] -; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[0:1] +; GFX950-NEXT: v_min_f64 v[0:1], v[2:3], v[4:5] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v9, v3, v6, vcc -; GFX950-NEXT: v_cndmask_b32_e64 v8, v2, 0, vcc -; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[8:11] sc0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[10:11] +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB246_2 ; GFX950-NEXT: ; %bb.3: ; %Flow ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_branch .LBB246_6 ; GFX950-NEXT: .LBB246_4: -; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: s_cbranch_execz .LBB246_6 ; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-NEXT: s_cselect_b32 s0, s0, -1 -; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0 ; GFX950-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_min_f64 v[4:5], v[2:3], v[0:1] -; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[0:1] +; GFX950-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc -; GFX950-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc -; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0 ; GFX950-NEXT: .LBB246_6: ; %atomicrmw.phi ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[2:3] +; GFX950-NEXT: ; use v[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll index 34a4899123749..c3531f16248e9 100644 --- a/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll +++ b/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll @@ -6023,12 +6023,12 @@ define void @global_atomic_fsub_f32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB117_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -6051,12 +6051,12 @@ define void @global_atomic_fsub_f32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB117_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -6072,20 +6072,20 @@ define void @global_atomic_fsub_f32_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-LABEL: global_atomic_fsub_f32_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v2, v[0:1], off offset:40 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:40 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ; def v4 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB118_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v3 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off offset:40 glc +; GFX90A-NEXT: v_sub_f32_e32 v2, v3, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB118_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6098,20 +6098,20 @@ define void @global_atomic_fsub_f32_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX950-LABEL: global_atomic_fsub_f32_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: global_load_dword v2, v[0:1], off offset:40 +; GFX950-NEXT: global_load_dword v3, v[0:1], off offset:40 ; GFX950-NEXT: s_mov_b64 s[0:1], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v4 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB118_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_sub_f32_e32 v4, v5, v3 -; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off offset:40 sc0 +; GFX950-NEXT: v_sub_f32_e32 v2, v3, v4 +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB118_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6146,13 +6146,13 @@ define void @global_atomic_fmax_f32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB119_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -6176,13 +6176,13 @@ define void @global_atomic_fmax_f32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB119_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -6198,22 +6198,22 @@ define void @global_atomic_fmax_f32_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-LABEL: global_atomic_fmax_f32_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v2, v[0:1], off offset:40 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:40 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ; def v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB120_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_max_f32_e32 v2, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v3 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off offset:40 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB120_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6226,22 +6226,22 @@ define void @global_atomic_fmax_f32_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX950-LABEL: global_atomic_fmax_f32_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: global_load_dword v2, v[0:1], off offset:40 +; GFX950-NEXT: global_load_dword v3, v[0:1], off offset:40 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v2 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[0:1], 0 -; GFX950-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX950-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX950-NEXT: .LBB120_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_max_f32_e32 v2, v5, v5 -; GFX950-NEXT: v_max_f32_e32 v4, v2, v3 -; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off offset:40 sc0 +; GFX950-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX950-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB120_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6276,13 +6276,13 @@ define void @global_atomic_fmin_f32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB121_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -6306,13 +6306,13 @@ define void @global_atomic_fmin_f32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB121_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -6328,22 +6328,22 @@ define void @global_atomic_fmin_f32_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-LABEL: global_atomic_fmin_f32_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v2, v[0:1], off offset:40 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:40 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ; def v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB122_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_max_f32_e32 v2, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v2, v3 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off offset:40 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB122_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6356,22 +6356,22 @@ define void @global_atomic_fmin_f32_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX950-LABEL: global_atomic_fmin_f32_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: global_load_dword v2, v[0:1], off offset:40 +; GFX950-NEXT: global_load_dword v3, v[0:1], off offset:40 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v2 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[0:1], 0 -; GFX950-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX950-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX950-NEXT: .LBB122_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_max_f32_e32 v2, v5, v5 -; GFX950-NEXT: v_min_f32_e32 v4, v2, v3 -; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off offset:40 sc0 +; GFX950-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX950-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB122_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6407,13 +6407,13 @@ define void @global_atomic_fmaximum_f32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB123_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -6436,12 +6436,12 @@ define void @global_atomic_fmaximum_f32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB123_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -6457,49 +6457,49 @@ define void @global_atomic_fmaximum_f32_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-LABEL: global_atomic_fmaximum_f32_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v4, v[0:1], off offset:40 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:40 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ; def v4 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB124_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v2 -; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v5, v2 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc -; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off offset:40 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v4 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v3, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB124_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v4 +; GFX90A-NEXT: ; use v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: global_atomic_fmaximum_f32_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: global_load_dword v2, v[0:1], off offset:40 +; GFX950-NEXT: global_load_dword v3, v[0:1], off offset:40 ; GFX950-NEXT: s_mov_b64 s[0:1], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v4 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB124_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_maximum3_f32 v4, v5, v3, v3 -; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off offset:40 sc0 +; GFX950-NEXT: v_maximum3_f32 v2, v3, v4, v4 +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB124_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6535,13 +6535,13 @@ define void @global_atomic_fminimum_f32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB125_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -6564,12 +6564,12 @@ define void @global_atomic_fminimum_f32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB125_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -6585,49 +6585,49 @@ define void @global_atomic_fminimum_f32_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-LABEL: global_atomic_fminimum_f32_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v4, v[0:1], off offset:40 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:40 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ; def v4 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB126_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: v_min_f32_e32 v4, v5, v2 -; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v5, v2 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc -; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off offset:40 glc +; GFX90A-NEXT: v_min_f32_e32 v2, v3, v4 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v3, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB126_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v4 +; GFX90A-NEXT: ; use v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: global_atomic_fminimum_f32_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: global_load_dword v2, v[0:1], off offset:40 +; GFX950-NEXT: global_load_dword v3, v[0:1], off offset:40 ; GFX950-NEXT: s_mov_b64 s[0:1], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v4 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB126_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_minimum3_f32 v4, v5, v3, v3 -; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off offset:40 sc0 +; GFX950-NEXT: v_minimum3_f32 v2, v3, v4, v4 +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB126_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6740,14 +6740,14 @@ define void @global_atomic_fsub_f64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB129_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -6770,14 +6770,14 @@ define void @global_atomic_fsub_f64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB129_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -6796,23 +6796,23 @@ define void @global_atomic_fsub_f64_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:80 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[6:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB130_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] -; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:80 glc +; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], -v[6:7] +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB130_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[4:5] +; GFX90A-NEXT: ; use v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6822,23 +6822,23 @@ define void @global_atomic_fsub_f64_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:80 ; GFX950-NEXT: s_mov_b64 s[0:1], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ; def v[6:7] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB130_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b64_e32 v[6:7], v[4:5] -; GFX950-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] -; GFX950-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:80 sc0 +; GFX950-NEXT: v_add_f64 v[2:3], v[4:5], -v[6:7] +; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB130_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[4:5] +; GFX950-NEXT: ; use v[2:3] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x double], ptr addrspace(1) %ptr, i64 0, i64 10 @@ -7020,14 +7020,14 @@ define void @global_atomic_fmaximum_f64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB135_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -7055,14 +7055,14 @@ define void @global_atomic_fmaximum_f64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB135_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -7080,28 +7080,28 @@ define void @global_atomic_fmaximum_f64_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:80 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[6:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB136_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[10:11], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: v_max_f64 v[4:5], v[10:11], v[2:3] -; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[2:3] -; GFX90A-NEXT: v_cndmask_b32_e32 v9, v5, v6, vcc -; GFX90A-NEXT: v_cndmask_b32_e64 v8, v4, 0, vcc -; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[8:11], off offset:80 glc +; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[6:7] +; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[6:7] +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[10:11] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB136_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[4:5] +; GFX90A-NEXT: ; use v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7110,29 +7110,29 @@ define void @global_atomic_fmaximum_f64_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:80 ; GFX950-NEXT: s_mov_b64 s[0:1], 0 -; GFX950-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX950-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ; def v[6:7] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB136_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b64_e32 v[10:11], v[4:5] -; GFX950-NEXT: v_max_f64 v[4:5], v[10:11], v[2:3] -; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[2:3] +; GFX950-NEXT: v_max_f64 v[2:3], v[4:5], v[6:7] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[6:7] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v9, v5, v6, vcc -; GFX950-NEXT: v_cndmask_b32_e64 v8, v4, 0, vcc -; GFX950-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[8:11], off offset:80 sc0 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[10:11] +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB136_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[4:5] +; GFX950-NEXT: ; use v[2:3] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x double], ptr addrspace(1) %ptr, i64 0, i64 10 @@ -7164,14 +7164,14 @@ define void @global_atomic_fminimum_f64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB137_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -7199,14 +7199,14 @@ define void @global_atomic_fminimum_f64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB137_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -7224,28 +7224,28 @@ define void @global_atomic_fminimum_f64_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:80 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[6:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB138_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[10:11], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: v_min_f64 v[4:5], v[10:11], v[2:3] -; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[2:3] -; GFX90A-NEXT: v_cndmask_b32_e32 v9, v5, v6, vcc -; GFX90A-NEXT: v_cndmask_b32_e64 v8, v4, 0, vcc -; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[8:11], off offset:80 glc +; GFX90A-NEXT: v_min_f64 v[2:3], v[4:5], v[6:7] +; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[6:7] +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[10:11] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB138_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[4:5] +; GFX90A-NEXT: ; use v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7254,29 +7254,29 @@ define void @global_atomic_fminimum_f64_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:80 ; GFX950-NEXT: s_mov_b64 s[0:1], 0 -; GFX950-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX950-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ; def v[6:7] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB138_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b64_e32 v[10:11], v[4:5] -; GFX950-NEXT: v_min_f64 v[4:5], v[10:11], v[2:3] -; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[2:3] +; GFX950-NEXT: v_min_f64 v[2:3], v[4:5], v[6:7] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[6:7] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v9, v5, v6, vcc -; GFX950-NEXT: v_cndmask_b32_e64 v8, v4, 0, vcc -; GFX950-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[8:11], off offset:80 sc0 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[10:11] +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB138_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[4:5] +; GFX950-NEXT: ; use v[2:3] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x double], ptr addrspace(1) %ptr, i64 0, i64 10 @@ -11796,12 +11796,12 @@ define void @global_atomic_fsub_f32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB225_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -11825,12 +11825,12 @@ define void @global_atomic_fsub_f32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB225_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -11846,54 +11846,54 @@ define void @global_atomic_fsub_f32_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) ; GFX90A-LABEL: global_atomic_fsub_f32_saddr_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: global_load_dword v1, v0, s[16:17] offset:40 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: global_load_dword v1, v2, s[16:17] offset:40 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ; def v3 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB226_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 -; GFX90A-NEXT: global_atomic_cmpswap v1, v0, v[4:5], s[16:17] offset:40 glc +; GFX90A-NEXT: v_sub_f32_e32 v0, v1, v3 +; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB226_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v1 +; GFX90A-NEXT: ; use v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: global_atomic_fsub_f32_saddr_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-NEXT: global_load_dword v1, v0, s[0:1] offset:40 +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: global_load_dword v1, v2, s[0:1] offset:40 ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ; def v3 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB226_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v1 -; GFX950-NEXT: v_sub_f32_e32 v4, v5, v2 -; GFX950-NEXT: global_atomic_cmpswap v1, v0, v[4:5], s[0:1] offset:40 sc0 +; GFX950-NEXT: v_sub_f32_e32 v0, v1, v3 +; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB226_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v1 +; GFX950-NEXT: ; use v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(1) %ptr, i64 0, i64 10 @@ -11923,13 +11923,13 @@ define void @global_atomic_fmax_f32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB227_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -11954,13 +11954,13 @@ define void @global_atomic_fmax_f32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB227_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -11976,58 +11976,58 @@ define void @global_atomic_fmax_f32_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) ; GFX90A-LABEL: global_atomic_fmax_f32_saddr_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: global_load_dword v1, v0, s[16:17] offset:40 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: global_load_dword v1, v2, s[16:17] offset:40 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ; def v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v3, v0, v0 ; GFX90A-NEXT: .LBB228_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: v_max_f32_e32 v1, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v1, v2 -; GFX90A-NEXT: global_atomic_cmpswap v1, v0, v[4:5], s[16:17] offset:40 glc +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX90A-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB228_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v1 +; GFX90A-NEXT: ; use v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: global_atomic_fmax_f32_saddr_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-NEXT: global_load_dword v1, v0, s[0:1] offset:40 +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: global_load_dword v1, v2, s[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ; def v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[2:3], 0 -; GFX950-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX950-NEXT: v_max_f32_e32 v3, v0, v0 ; GFX950-NEXT: .LBB228_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v1 -; GFX950-NEXT: v_max_f32_e32 v1, v5, v5 -; GFX950-NEXT: v_max_f32_e32 v4, v1, v2 -; GFX950-NEXT: global_atomic_cmpswap v1, v0, v[4:5], s[0:1] offset:40 sc0 +; GFX950-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX950-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB228_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v1 +; GFX950-NEXT: ; use v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(1) %ptr, i64 0, i64 10 @@ -12057,13 +12057,13 @@ define void @global_atomic_fmin_f32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB229_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -12088,13 +12088,13 @@ define void @global_atomic_fmin_f32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB229_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -12110,58 +12110,58 @@ define void @global_atomic_fmin_f32_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) ; GFX90A-LABEL: global_atomic_fmin_f32_saddr_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: global_load_dword v1, v0, s[16:17] offset:40 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: global_load_dword v1, v2, s[16:17] offset:40 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ; def v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v3, v0, v0 ; GFX90A-NEXT: .LBB230_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: v_max_f32_e32 v1, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v1, v2 -; GFX90A-NEXT: global_atomic_cmpswap v1, v0, v[4:5], s[16:17] offset:40 glc +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX90A-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB230_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v1 +; GFX90A-NEXT: ; use v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: global_atomic_fmin_f32_saddr_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-NEXT: global_load_dword v1, v0, s[0:1] offset:40 +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: global_load_dword v1, v2, s[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ; def v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[2:3], 0 -; GFX950-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX950-NEXT: v_max_f32_e32 v3, v0, v0 ; GFX950-NEXT: .LBB230_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v1 -; GFX950-NEXT: v_max_f32_e32 v1, v5, v5 -; GFX950-NEXT: v_min_f32_e32 v4, v1, v2 -; GFX950-NEXT: global_atomic_cmpswap v1, v0, v[4:5], s[0:1] offset:40 sc0 +; GFX950-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX950-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB230_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v1 +; GFX950-NEXT: ; use v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(1) %ptr, i64 0, i64 10 @@ -12192,13 +12192,13 @@ define void @global_atomic_fmaximum_f32_saddr_ret_a_a(ptr addrspace(1) inreg %pt ; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB231_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -12222,12 +12222,12 @@ define void @global_atomic_fmaximum_f32_saddr_ret_a_a(ptr addrspace(1) inreg %pt ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB231_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -12243,57 +12243,57 @@ define void @global_atomic_fmaximum_f32_saddr_ret_av_av(ptr addrspace(1) inreg % ; GFX90A-LABEL: global_atomic_fmaximum_f32_saddr_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: global_load_dword v3, v0, s[16:17] offset:40 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: global_load_dword v1, v2, s[16:17] offset:40 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ; def v3 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB232_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v1 -; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v5, v1 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v2, v3, vcc -; GFX90A-NEXT: global_atomic_cmpswap v3, v0, v[4:5], s[16:17] offset:40 glc +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v3 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB232_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v3 +; GFX90A-NEXT: ; use v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: global_atomic_fmaximum_f32_saddr_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-NEXT: global_load_dword v1, v0, s[0:1] offset:40 +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: global_load_dword v1, v2, s[0:1] offset:40 ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ; def v3 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB232_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v1 -; GFX950-NEXT: v_maximum3_f32 v4, v5, v2, v2 -; GFX950-NEXT: global_atomic_cmpswap v1, v0, v[4:5], s[0:1] offset:40 sc0 +; GFX950-NEXT: v_maximum3_f32 v0, v1, v3, v3 +; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB232_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v1 +; GFX950-NEXT: ; use v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(1) %ptr, i64 0, i64 10 @@ -12324,13 +12324,13 @@ define void @global_atomic_fminimum_f32_saddr_ret_a_a(ptr addrspace(1) inreg %pt ; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB233_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -12354,12 +12354,12 @@ define void @global_atomic_fminimum_f32_saddr_ret_a_a(ptr addrspace(1) inreg %pt ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB233_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -12375,57 +12375,57 @@ define void @global_atomic_fminimum_f32_saddr_ret_av_av(ptr addrspace(1) inreg % ; GFX90A-LABEL: global_atomic_fminimum_f32_saddr_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: global_load_dword v3, v0, s[16:17] offset:40 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: global_load_dword v1, v2, s[16:17] offset:40 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ; def v3 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB234_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_min_f32_e32 v3, v5, v1 -; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v5, v1 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v2, v3, vcc -; GFX90A-NEXT: global_atomic_cmpswap v3, v0, v[4:5], s[16:17] offset:40 glc +; GFX90A-NEXT: v_min_f32_e32 v0, v1, v3 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB234_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v3 +; GFX90A-NEXT: ; use v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: global_atomic_fminimum_f32_saddr_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-NEXT: global_load_dword v1, v0, s[0:1] offset:40 +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: global_load_dword v1, v2, s[0:1] offset:40 ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ; def v3 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB234_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v1 -; GFX950-NEXT: v_minimum3_f32 v4, v5, v2, v2 -; GFX950-NEXT: global_atomic_cmpswap v1, v0, v[4:5], s[0:1] offset:40 sc0 +; GFX950-NEXT: v_minimum3_f32 v0, v1, v3, v3 +; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB234_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v1 +; GFX950-NEXT: ; use v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(1) %ptr, i64 0, i64 10 @@ -12536,14 +12536,14 @@ define void @global_atomic_fsub_f64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[16:17] offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB237_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -12567,14 +12567,14 @@ define void @global_atomic_fsub_f64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX950-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB237_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -12590,54 +12590,54 @@ define void @global_atomic_fsub_f64_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) ; GFX90A-LABEL: global_atomic_fsub_f64_saddr_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: global_load_dwordx2 v[2:3], v4, s[16:17] offset:80 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: global_load_dwordx2 v[2:3], v6, s[16:17] offset:80 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB238_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: v_add_f64 v[6:7], v[8:9], -v[0:1] -; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v4, v[6:9], s[16:17] offset:80 glc +; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[16:17] offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB238_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ; use v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: global_atomic_fsub_f64_saddr_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v4, 0 -; GFX950-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:80 +; GFX950-NEXT: v_mov_b32_e32 v6, 0 +; GFX950-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] offset:80 ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ; def v[4:5] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB238_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b64_e32 v[8:9], v[2:3] -; GFX950-NEXT: v_add_f64 v[6:7], v[8:9], -v[0:1] -; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v4, v[6:9], s[0:1] offset:80 sc0 +; GFX950-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX950-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB238_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[2:3] +; GFX950-NEXT: ; use v[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x double], ptr addrspace(1) %ptr, i64 0, i64 10 @@ -12826,14 +12826,14 @@ define void @global_atomic_fmaximum_f64_saddr_ret_a_a(ptr addrspace(1) inreg %pt ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[16:17] offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB243_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -12862,14 +12862,14 @@ define void @global_atomic_fmaximum_f64_saddr_ret_a_a(ptr addrspace(1) inreg %pt ; GFX950-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB243_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -12885,63 +12885,63 @@ define void @global_atomic_fmaximum_f64_saddr_ret_av_av(ptr addrspace(1) inreg % ; GFX90A-LABEL: global_atomic_fmaximum_f64_saddr_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: global_load_dwordx2 v[2:3], v4, s[16:17] offset:80 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: global_load_dwordx2 v[2:3], v6, s[16:17] offset:80 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB244_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: v_max_f64 v[2:3], v[8:9], v[0:1] -; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[8:9], v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e32 v7, v3, v5, vcc -; GFX90A-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc -; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v4, v[6:9], s[16:17] offset:80 glc +; GFX90A-NEXT: v_max_f64 v[0:1], v[2:3], v[4:5] +; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[16:17] offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB244_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ; use v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: global_atomic_fmaximum_f64_saddr_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v4, 0 -; GFX950-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:80 +; GFX950-NEXT: v_mov_b32_e32 v6, 0 +; GFX950-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] offset:80 ; GFX950-NEXT: s_mov_b64 s[2:3], 0 -; GFX950-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; GFX950-NEXT: v_mov_b32_e32 v7, 0x7ff80000 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ; def v[4:5] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB244_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b64_e32 v[8:9], v[2:3] -; GFX950-NEXT: v_max_f64 v[2:3], v[8:9], v[0:1] -; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[8:9], v[0:1] +; GFX950-NEXT: v_max_f64 v[0:1], v[2:3], v[4:5] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v7, v3, v5, vcc -; GFX950-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc -; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v4, v[6:9], s[0:1] offset:80 sc0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX950-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB244_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[2:3] +; GFX950-NEXT: ; use v[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x double], ptr addrspace(1) %ptr, i64 0, i64 10 @@ -12974,14 +12974,14 @@ define void @global_atomic_fminimum_f64_saddr_ret_a_a(ptr addrspace(1) inreg %pt ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[16:17] offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB245_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -13010,14 +13010,14 @@ define void @global_atomic_fminimum_f64_saddr_ret_a_a(ptr addrspace(1) inreg %pt ; GFX950-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB245_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -13033,63 +13033,63 @@ define void @global_atomic_fminimum_f64_saddr_ret_av_av(ptr addrspace(1) inreg % ; GFX90A-LABEL: global_atomic_fminimum_f64_saddr_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: global_load_dwordx2 v[2:3], v4, s[16:17] offset:80 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: global_load_dwordx2 v[2:3], v6, s[16:17] offset:80 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB246_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: v_min_f64 v[2:3], v[8:9], v[0:1] -; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[8:9], v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e32 v7, v3, v5, vcc -; GFX90A-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc -; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v4, v[6:9], s[16:17] offset:80 glc +; GFX90A-NEXT: v_min_f64 v[0:1], v[2:3], v[4:5] +; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[16:17] offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB246_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ; use v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: global_atomic_fminimum_f64_saddr_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v4, 0 -; GFX950-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:80 +; GFX950-NEXT: v_mov_b32_e32 v6, 0 +; GFX950-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] offset:80 ; GFX950-NEXT: s_mov_b64 s[2:3], 0 -; GFX950-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; GFX950-NEXT: v_mov_b32_e32 v7, 0x7ff80000 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ; def v[4:5] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB246_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b64_e32 v[8:9], v[2:3] -; GFX950-NEXT: v_min_f64 v[2:3], v[8:9], v[0:1] -; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[8:9], v[0:1] +; GFX950-NEXT: v_min_f64 v[0:1], v[2:3], v[4:5] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v7, v3, v5, vcc -; GFX950-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc -; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v4, v[6:9], s[0:1] offset:80 sc0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX950-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB246_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[2:3] +; GFX950-NEXT: ; use v[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x double], ptr addrspace(1) %ptr, i64 0, i64 10 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index 6a95881067b93..ff74d1f71616d 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -2147,12 +2147,12 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, s4, v2, s[2:3] +; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s4, v2, s[2:3] ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_mov_b32 s2, -1 -; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s5, v2, v[1:2] -; GFX1164-NEXT: v_mov_b32_e32 v1, v3 -; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1164-NEXT: v_mov_b32_e32 v0, v4 +; GFX1164-NEXT: v_mad_u64_u32 v[4:5], null, s5, v2, v[0:1] +; GFX1164-NEXT: buffer_store_b64 v[3:4], off, s[0:3], 0 ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: add_i64_uniform: @@ -2190,12 +2190,12 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, s4, v2, s[2:3] +; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s4, v2, s[2:3] ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_mov_b32 s2, -1 -; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s5, v2, v[1:2] -; GFX1132-NEXT: v_mov_b32_e32 v1, v3 -; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1132-NEXT: v_mov_b32_e32 v0, v4 +; GFX1132-NEXT: v_mad_u64_u32 v[4:5], null, s5, v2, v[0:1] +; GFX1132-NEXT: buffer_store_b64 v[3:4], off, s[0:3], 0 ; GFX1132-NEXT: s_endpgm ; ; GFX1264-LABEL: add_i64_uniform: @@ -6208,10 +6208,9 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX9-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s8, v4, 0 -; GFX9-NEXT: v_readfirstlane_b32 s7, v0 ; GFX9-NEXT: v_readfirstlane_b32 s6, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, v3 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s9, v4, v[0:1] +; GFX9-NEXT: v_readfirstlane_b32 s7, v0 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s9, v4, v[3:4] ; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s7, v2 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index 08a4f0cdad18f..f5ca24f59a286 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -1889,13 +1889,13 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; GFX1164-NEXT: v_readfirstlane_b32 s5, v1 ; GFX1164-NEXT: v_readfirstlane_b32 s4, v0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, s[4:5] ; GFX1164-NEXT: s_mov_b32 s2, -1 -; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2] +; GFX1164-NEXT: v_mov_b32_e32 v0, v4 +; GFX1164-NEXT: v_mad_u64_u32 v[4:5], null, s3, v2, v[0:1] ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164-NEXT: v_mov_b32_e32 v1, v3 -; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1164-NEXT: buffer_store_b64 v[3:4], off, s[0:3], 0 ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: add_i64_uniform: @@ -1926,13 +1926,13 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; GFX1132-NEXT: v_readfirstlane_b32 s5, v1 ; GFX1132-NEXT: v_readfirstlane_b32 s4, v0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5] +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, s[4:5] ; GFX1132-NEXT: s_mov_b32 s2, -1 -; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2] +; GFX1132-NEXT: v_mov_b32_e32 v0, v4 +; GFX1132-NEXT: v_mad_u64_u32 v[4:5], null, s3, v2, v[0:1] ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132-NEXT: v_mov_b32_e32 v1, v3 -; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1132-NEXT: buffer_store_b64 v[3:4], off, s[0:3], 0 ; GFX1132-NEXT: s_endpgm entry: %old = atomicrmw add ptr addrspace(3) @local_var64, i64 %additive acq_rel @@ -5182,13 +5182,12 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; GFX9-NEXT: .LBB13_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[8:9], s2, v2, 0 -; GFX9-NEXT: v_readfirstlane_b32 s8, v0 ; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v2, 0 ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v2, v[0:1] +; GFX9-NEXT: v_readfirstlane_b32 s8, v0 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v2, v[4:5] ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s8, v3 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll index 3c316f4acedb7..89c3e8d615efd 100644 --- a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll +++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll @@ -381,17 +381,17 @@ define float @no_unsafe(ptr %addr, float %val) { ; GFX90A-LABEL: no_unsafe: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB3_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -836,7 +836,7 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 { ; GFX90A-NEXT: s_getpc_b64 s[4:5] ; GFX90A-NEXT: s_add_u32 s4, s4, global@rel32@lo+4 ; GFX90A-NEXT: s_addc_u32 s5, s5, global@rel32@hi+12 -; GFX90A-NEXT: global_load_dwordx2 v[2:3], v2, s[4:5] +; GFX90A-NEXT: global_load_dwordx2 v[4:5], v2, s[4:5] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: s_branch .LBB5_7 ; GFX90A-NEXT: .LBB5_6: ; %Flow @@ -846,7 +846,6 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 { ; GFX90A-NEXT: .LBB5_7: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] ; GFX90A-NEXT: s_getpc_b64 s[6:7] ; GFX90A-NEXT: s_add_u32 s6, s6, global@rel32@lo+4 @@ -856,6 +855,7 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e64 s[6:7], v[2:3], v[4:5] ; GFX90A-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB5_7 ; GFX90A-NEXT: ; %bb.8: ; %atomicrmw.end1 @@ -926,7 +926,7 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 { ; GFX942-NEXT: s_getpc_b64 s[0:1] ; GFX942-NEXT: s_add_u32 s0, s0, global@rel32@lo+4 ; GFX942-NEXT: s_addc_u32 s1, s1, global@rel32@hi+12 -; GFX942-NEXT: global_load_dwordx2 v[2:3], v2, s[0:1] +; GFX942-NEXT: global_load_dwordx2 v[4:5], v2, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: s_branch .LBB5_7 ; GFX942-NEXT: .LBB5_6: ; %Flow @@ -936,7 +936,6 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 { ; GFX942-NEXT: .LBB5_7: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX942-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] ; GFX942-NEXT: s_getpc_b64 s[2:3] ; GFX942-NEXT: s_add_u32 s2, s2, global@rel32@lo+4 @@ -946,6 +945,7 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 { ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_cmp_eq_u64_e64 s[2:3], v[2:3], v[4:5] ; GFX942-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB5_7 ; GFX942-NEXT: ; %bb.8: ; %atomicrmw.end1 diff --git a/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll b/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll index 37040123ee20c..42f76c4a10d2a 100644 --- a/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll +++ b/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll @@ -7,8 +7,8 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2, ; CHECK-NEXT: s_load_dword s0, s[4:5], 0x8 ; CHECK-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x0 ; CHECK-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x10 -; CHECK-NEXT: v_mov_b32_e32 v1, 0x3e21eeb6 ; CHECK-NEXT: v_mov_b32_e32 v20, 0 +; CHECK-NEXT: v_mov_b32_e32 v1, 0x3e21eeb6 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_bitcmp1_b32 s0, 0 ; CHECK-NEXT: s_cselect_b64 s[16:17], -1, 0 @@ -16,12 +16,10 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2, ; CHECK-NEXT: s_bitcmp1_b32 s0, 8 ; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] -; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 -; CHECK-NEXT: v_mov_b32_e32 v0, 0x9037ab78 -; CHECK-NEXT: v_accvgpr_write_b32 a3, v1 ; CHECK-NEXT: s_xor_b64 s[20:21], s[2:3], -1 +; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 ; CHECK-NEXT: s_and_b64 s[2:3], exec, s[2:3] -; CHECK-NEXT: v_accvgpr_write_b32 a2, v0 +; CHECK-NEXT: v_mov_b32_e32 v0, 0x9037ab78 ; CHECK-NEXT: v_mov_b32_e32 v2, 0xa17f65f6 ; CHECK-NEXT: v_mov_b32_e32 v3, 0xbe927e4f ; CHECK-NEXT: v_mov_b32_e32 v4, 0x19f4ec90 @@ -37,8 +35,8 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2, ; CHECK-NEXT: v_mov_b32_e32 v14, 0x8427b883 ; CHECK-NEXT: v_mov_b32_e32 v15, 0x3fae1bb4 ; CHECK-NEXT: s_mov_b64 s[22:23], 0 -; CHECK-NEXT: v_mov_b32_e32 v0, 0x57b87036 -; CHECK-NEXT: v_mov_b32_e32 v1, 0x3fb3b136 +; CHECK-NEXT: v_mov_b32_e32 v16, 0x57b87036 +; CHECK-NEXT: v_mov_b32_e32 v17, 0x3fb3b136 ; CHECK-NEXT: s_and_b64 s[4:5], exec, s[16:17] ; CHECK-NEXT: v_mov_b32_e32 v18, 0x55555523 ; CHECK-NEXT: v_mov_b32_e32 v19, 0xbfd55555 @@ -64,10 +62,8 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2, ; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1 ; CHECK-NEXT: v_mov_b64_e32 v[24:25], s[14:15] ; CHECK-NEXT: flat_load_dwordx2 v[24:25], v[24:25] -; CHECK-NEXT: v_accvgpr_read_b32 v27, a3 -; CHECK-NEXT: v_accvgpr_read_b32 v26, a2 +; CHECK-NEXT: v_mov_b64_e32 v[26:27], v[0:1] ; CHECK-NEXT: v_mov_b64_e32 v[28:29], v[2:3] -; CHECK-NEXT: v_mov_b64_e32 v[16:17], v[0:1] ; CHECK-NEXT: v_accvgpr_write_b32 a0, 0 ; CHECK-NEXT: v_accvgpr_write_b32 a1, 0 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -85,9 +81,10 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2, ; CHECK-NEXT: v_fmac_f64_e32 v[26:27], 0, v[28:29] ; CHECK-NEXT: v_mov_b64_e32 v[28:29], v[14:15] ; CHECK-NEXT: v_fmac_f64_e32 v[28:29], 0, v[26:27] -; CHECK-NEXT: v_fmac_f64_e32 v[16:17], 0, v[28:29] -; CHECK-NEXT: v_mov_b64_e32 v[26:27], v[18:19] -; CHECK-NEXT: v_fmac_f64_e32 v[26:27], 0, v[16:17] +; CHECK-NEXT: v_mov_b64_e32 v[26:27], v[16:17] +; CHECK-NEXT: v_fmac_f64_e32 v[26:27], 0, v[28:29] +; CHECK-NEXT: v_mov_b64_e32 v[28:29], v[18:19] +; CHECK-NEXT: v_fmac_f64_e32 v[28:29], 0, v[26:27] ; CHECK-NEXT: s_branch .LBB0_6 ; CHECK-NEXT: .LBB0_5: ; %Flow ; CHECK-NEXT: ; in Loop: Header=BB0_6 Depth=2 @@ -96,8 +93,8 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2, ; CHECK-NEXT: .LBB0_6: ; %.preheader1855.i.i.i3329 ; CHECK-NEXT: ; Parent Loop BB0_2 Depth=1 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 -; CHECK-NEXT: v_accvgpr_read_b32 v29, a1 -; CHECK-NEXT: v_accvgpr_read_b32 v28, a0 +; CHECK-NEXT: v_accvgpr_read_b32 v27, a1 +; CHECK-NEXT: v_accvgpr_read_b32 v26, a0 ; CHECK-NEXT: s_mov_b64 s[24:25], -1 ; CHECK-NEXT: s_mov_b64 s[8:9], -1 ; CHECK-NEXT: s_mov_b64 vcc, s[2:3] @@ -112,9 +109,9 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2, ; CHECK-NEXT: s_cbranch_vccz .LBB0_5 ; CHECK-NEXT: ; %bb.8: ; %.preheader1856.preheader.i.i.i3325 ; CHECK-NEXT: ; in Loop: Header=BB0_6 Depth=2 -; CHECK-NEXT: v_accvgpr_write_b32 a0, v26 +; CHECK-NEXT: v_accvgpr_write_b32 a0, v28 ; CHECK-NEXT: s_mov_b64 s[24:25], 0 -; CHECK-NEXT: v_accvgpr_write_b32 a1, v27 +; CHECK-NEXT: v_accvgpr_write_b32 a1, v29 ; CHECK-NEXT: s_mov_b64 s[8:9], 0 ; CHECK-NEXT: s_branch .LBB0_5 ; CHECK-NEXT: .LBB0_9: ; in Loop: Header=BB0_2 Depth=1 @@ -132,13 +129,13 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2, ; CHECK-NEXT: s_cbranch_vccz .LBB0_13 ; CHECK-NEXT: ; %bb.12: ; %._crit_edge2105.i.i.i2330.loopexit ; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1 -; CHECK-NEXT: v_cmp_nlg_f64_e64 s[8:9], 0, v[28:29] +; CHECK-NEXT: v_cmp_nlg_f64_e64 s[8:9], 0, v[26:27] ; CHECK-NEXT: v_cndmask_b32_e64 v23, v23, 0, s[16:17] ; CHECK-NEXT: v_cndmask_b32_e64 v22, v22, 0, s[16:17] -; CHECK-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[8:9] -; CHECK-NEXT: v_mov_b32_e32 v17, v16 +; CHECK-NEXT: v_cndmask_b32_e64 v26, 0, 1, s[8:9] +; CHECK-NEXT: v_mov_b32_e32 v27, v26 ; CHECK-NEXT: s_and_b64 s[8:9], exec, s[16:17] -; CHECK-NEXT: global_store_dwordx2 v20, v[16:17], s[12:13] +; CHECK-NEXT: global_store_dwordx2 v20, v[26:27], s[12:13] ; CHECK-NEXT: s_cselect_b32 s23, s23, 0 ; CHECK-NEXT: s_cselect_b32 s22, s22, 0 ; CHECK-NEXT: s_mov_b64 s[8:9], -1 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-rtn.ll index b80aa9324e616..4909fac4f307d 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-rtn.ll @@ -18,7 +18,7 @@ define amdgpu_ps float @buffer_atomic_fadd_f32_offset_rtn(float %val, <4 x i32> ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY1]], %subreg.sub3 - ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFSET_RTN [[COPY5]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN:%[0-9]+]]:av_32 = BUFFER_ATOMIC_ADD_F32_OFFSET_RTN [[COPY5]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN]] ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0 ; @@ -53,7 +53,7 @@ define amdgpu_ps float @buffer_atomic_fadd_f32_offen_rtn(float %val, <4 x i32> i ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 - ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:av_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN]] ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0 ; @@ -89,7 +89,7 @@ define amdgpu_ps float @buffer_atomic_fadd_f32_idxen_rtn(float %val, <4 x i32> i ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 - ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:av_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN]] ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0 ; @@ -127,7 +127,7 @@ define amdgpu_ps float @buffer_atomic_fadd_f32_bothen_rtn(float %val, <4 x i32> ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY7]], killed [[REG_SEQUENCE1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:av_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY7]], killed [[REG_SEQUENCE1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN]] ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0 ; @@ -170,7 +170,7 @@ define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_offset_rtn(float %val, ptr ad ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 - ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFSET_RTN [[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN:%[0-9]+]]:av_32 = BUFFER_ATOMIC_ADD_F32_OFFSET_RTN [[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN]] ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0 ; @@ -217,7 +217,7 @@ define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_offen_rtn(float %val, ptr add ; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX90A_GFX942-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 - ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:av_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN]] ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0 ; @@ -265,7 +265,7 @@ define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_idxen_rtn(float %val, ptr add ; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX90A_GFX942-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 - ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:av_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN]] ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0 ; @@ -315,7 +315,7 @@ define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_bothen_rtn(float %val, ptr ad ; GFX90A_GFX942-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY7]], killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:av_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY7]], killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN]] ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0 ; diff --git a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f64.ll index 2ce54f8a463c7..30536b18674b5 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f64.ll @@ -108,7 +108,7 @@ define amdgpu_ps double @buffer_atomic_fadd_f64_offset_rtn(double %val, <4 x i32 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY1]], %subreg.sub3 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:av_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFSET_RTN [[COPY7]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN:%[0-9]+]]:av_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFSET_RTN [[COPY7]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub0 ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY8]], implicit $exec ; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub1 @@ -136,7 +136,7 @@ define amdgpu_ps double @buffer_atomic_fadd_f64_offen_rtn(double %val, <4 x i32> ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1 ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:av_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFEN_RTN [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN:%[0-9]+]]:av_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFEN_RTN [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) ; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub0 ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY9]], implicit $exec ; GFX90A_GFX942-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub1 @@ -164,7 +164,7 @@ define amdgpu_ps double @buffer_atomic_fadd_f64_idxen_rtn(double %val, <4 x i32> ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1 ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:av_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_IDXEN_RTN [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN:%[0-9]+]]:av_64_align2 = BUFFER_ATOMIC_ADD_F64_IDXEN_RTN [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) ; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub0 ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY9]], implicit $exec ; GFX90A_GFX942-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub1 @@ -194,7 +194,7 @@ define amdgpu_ps double @buffer_atomic_fadd_f64_bothen_rtn(double %val, <4 x i32 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:av_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN [[COPY9]], killed [[REG_SEQUENCE2]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN:%[0-9]+]]:av_64_align2 = BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN [[COPY9]], killed [[REG_SEQUENCE2]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) ; GFX90A_GFX942-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub0 ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY10]], implicit $exec ; GFX90A_GFX942-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub1 @@ -340,7 +340,7 @@ define amdgpu_ps double @buffer_ptr_atomic_fadd_f64_offset_rtn(double %val, ptr ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; GFX90A_GFX942-NEXT: [[COPY11:%[0-9]+]]:av_64_align2 = COPY [[REG_SEQUENCE3]] - ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFSET_RTN [[COPY11]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN:%[0-9]+]]:av_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFSET_RTN [[COPY11]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX942-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub0 ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY12]], implicit $exec ; GFX90A_GFX942-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub1 @@ -374,7 +374,7 @@ define amdgpu_ps double @buffer_ptr_atomic_fadd_f64_offen_rtn(double %val, ptr a ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1 ; GFX90A_GFX942-NEXT: [[COPY12:%[0-9]+]]:av_64_align2 = COPY [[REG_SEQUENCE3]] - ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFEN_RTN [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN:%[0-9]+]]:av_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFEN_RTN [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX942-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub0 ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY13]], implicit $exec ; GFX90A_GFX942-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub1 @@ -408,7 +408,7 @@ define amdgpu_ps double @buffer_ptr_atomic_fadd_f64_idxen_rtn(double %val, ptr a ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1 ; GFX90A_GFX942-NEXT: [[COPY12:%[0-9]+]]:av_64_align2 = COPY [[REG_SEQUENCE3]] - ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_IDXEN_RTN [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN:%[0-9]+]]:av_64_align2 = BUFFER_ATOMIC_ADD_F64_IDXEN_RTN [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX942-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub0 ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY13]], implicit $exec ; GFX90A_GFX942-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub1 @@ -444,7 +444,7 @@ define amdgpu_ps double @buffer_ptr_atomic_fadd_f64_bothen_rtn(double %val, ptr ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX90A_GFX942-NEXT: [[COPY13:%[0-9]+]]:av_64_align2 = COPY [[REG_SEQUENCE3]] - ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN [[COPY13]], killed [[REG_SEQUENCE4]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN:%[0-9]+]]:av_64_align2 = BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN [[COPY13]], killed [[REG_SEQUENCE4]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX942-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub0 ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY14]], implicit $exec ; GFX90A_GFX942-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub1 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll index 142290a39f8f4..361bc78759bfa 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll @@ -2382,17 +2382,17 @@ define <12 x i8> @load_v12i8(ptr addrspace(8) inreg %buf) { ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: buffer_load_dwordx3 v[0:2], off, s[16:19], 0 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v8, v2 -; SDAG-NEXT: v_lshrrev_b32_e32 v9, 8, v2 ; SDAG-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] ; SDAG-NEXT: v_lshrrev_b32_e32 v14, 8, v0 ; SDAG-NEXT: v_lshrrev_b32_e32 v13, 16, v0 -; SDAG-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] +; SDAG-NEXT: v_lshrrev_b64 v[11:12], 24, v[2:3] ; SDAG-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; SDAG-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; SDAG-NEXT: v_lshrrev_b32_e32 v9, 8, v2 ; SDAG-NEXT: v_lshrrev_b32_e32 v10, 16, v2 ; SDAG-NEXT: v_mov_b32_e32 v4, v1 +; SDAG-NEXT: v_mov_b32_e32 v8, v2 ; SDAG-NEXT: v_mov_b32_e32 v1, v14 ; SDAG-NEXT: v_mov_b32_e32 v2, v13 ; SDAG-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/bypass-div.ll b/llvm/test/CodeGen/AMDGPU/bypass-div.ll index d7d697ef85b9f..00baf0a44368d 100644 --- a/llvm/test/CodeGen/AMDGPU/bypass-div.ll +++ b/llvm/test/CodeGen/AMDGPU/bypass-div.ll @@ -1026,102 +1026,100 @@ define i64 @sdiv64_known32(i64 %a, i64 %b) { ; GFX9-NEXT: v_or_b32_e32 v5, v2, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX9-NEXT: v_mov_b32_e32 v7, v1 -; GFX9-NEXT: v_mov_b32_e32 v6, v3 ; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB10_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v0 -; GFX9-NEXT: v_sub_co_u32_e32 v11, vcc, 0, v6 -; GFX9-NEXT: v_subb_co_u32_e32 v12, vcc, 0, v0, vcc -; GFX9-NEXT: v_madmk_f32 v1, v3, 0x4f800000, v1 -; GFX9-NEXT: v_rcp_f32_e32 v1, v1 -; GFX9-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 -; GFX9-NEXT: v_mul_f32_e32 v3, 0x2f800000, v1 -; GFX9-NEXT: v_trunc_f32_e32 v3, v3 -; GFX9-NEXT: v_madmk_f32 v1, v3, 0xcf800000, v1 -; GFX9-NEXT: v_cvt_u32_f32_e32 v10, v3 -; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_mul_lo_u32 v5, v11, v10 -; GFX9-NEXT: v_mul_lo_u32 v8, v12, v1 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v1, 0 -; GFX9-NEXT: v_add3_u32 v8, v4, v5, v8 -; GFX9-NEXT: v_mul_hi_u32 v9, v1, v3 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v8, 0 -; GFX9-NEXT: v_add_co_u32_e32 v13, vcc, v9, v4 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v10, v8, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v13, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v4, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v9, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v8 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, v10, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v5, v11, v13 -; GFX9-NEXT: v_mul_lo_u32 v8, v12, v1 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v1, 0 -; GFX9-NEXT: v_add3_u32 v8, v4, v5, v8 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v8, 0 -; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v8, 0 -; GFX9-NEXT: v_mul_hi_u32 v12, v1, v3 -; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v3, 0 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v12, v8 -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v9, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v10 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v11, vcc +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, v0 +; GFX9-NEXT: v_sub_co_u32_e32 v10, vcc, 0, v3 +; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, 0, v0, vcc +; GFX9-NEXT: v_madmk_f32 v4, v5, 0x4f800000, v4 +; GFX9-NEXT: v_rcp_f32_e32 v4, v4 +; GFX9-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 +; GFX9-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 +; GFX9-NEXT: v_trunc_f32_e32 v5, v5 +; GFX9-NEXT: v_madmk_f32 v4, v5, 0xcf800000, v4 +; GFX9-NEXT: v_cvt_u32_f32_e32 v8, v5 +; GFX9-NEXT: v_cvt_u32_f32_e32 v9, v4 +; GFX9-NEXT: v_mul_lo_u32 v6, v10, v8 +; GFX9-NEXT: v_mul_lo_u32 v7, v11, v9 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v9, 0 +; GFX9-NEXT: v_add3_u32 v7, v5, v6, v7 +; GFX9-NEXT: v_mul_hi_u32 v12, v9, v4 +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v7, 0 +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v12, v5 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v4, 0 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v6, vcc +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v7, 0 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v12, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v13, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v13, v4, vcc -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v5, 0 -; GFX9-NEXT: v_mul_hi_u32 v8, v7, v1 -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v8, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v4, vcc -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v1, 0 -; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v2, v5, 0 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v10, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v11, v4, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v9, vcc -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v8 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v3, vcc -; GFX9-NEXT: v_mul_lo_u32 v8, v0, v1 -; GFX9-NEXT: v_mul_lo_u32 v9, v6, v5 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v1, 0 -; GFX9-NEXT: v_add3_u32 v4, v4, v9, v8 -; GFX9-NEXT: v_sub_u32_e32 v8, v2, v4 -; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, v7, v3 -; GFX9-NEXT: v_subb_co_u32_e64 v7, s[4:5], v8, v0, vcc -; GFX9-NEXT: v_sub_co_u32_e64 v8, s[4:5], v3, v6 -; GFX9-NEXT: v_subbrev_co_u32_e64 v7, s[4:5], 0, v7, s[4:5] -; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v0 +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v9, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, v8, v5, vcc +; GFX9-NEXT: v_mul_lo_u32 v6, v10, v13 +; GFX9-NEXT: v_mul_lo_u32 v7, v11, v12 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v12, 0 +; GFX9-NEXT: v_add3_u32 v7, v5, v6, v7 +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v7, 0 +; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v7, 0 +; GFX9-NEXT: v_mul_hi_u32 v11, v12, v4 +; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v4, 0 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v11, v7 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v8, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v9 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v10, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v12, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v13, v5, vcc +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v7, 0 +; GFX9-NEXT: v_mul_hi_u32 v8, v1, v6 +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v5, vcc +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, 0 +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v2, v7, 0 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v8, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v4, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v5, vcc +; GFX9-NEXT: v_mul_lo_u32 v8, v0, v6 +; GFX9-NEXT: v_mul_lo_u32 v9, v3, v7 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v6, 0 +; GFX9-NEXT: v_add3_u32 v5, v5, v9, v8 +; GFX9-NEXT: v_sub_u32_e32 v8, v2, v5 +; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, v1, v4 +; GFX9-NEXT: v_subb_co_u32_e64 v4, s[4:5], v8, v0, vcc +; GFX9-NEXT: v_sub_co_u32_e64 v8, s[4:5], v1, v3 +; GFX9-NEXT: v_subbrev_co_u32_e64 v4, s[4:5], 0, v4, s[4:5] +; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v6 +; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v7, v9, v8, s[4:5] -; GFX9-NEXT: v_add_co_u32_e64 v8, s[4:5], 2, v1 -; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v4, vcc -; GFX9-NEXT: v_addc_co_u32_e64 v9, s[4:5], 0, v5, s[4:5] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v4, v9, v8, s[4:5] +; GFX9-NEXT: v_add_co_u32_e64 v8, s[4:5], 2, v6 +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e64 v9, s[4:5], 0, v7, s[4:5] ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v2, v0 -; GFX9-NEXT: v_add_co_u32_e64 v10, s[4:5], 1, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc -; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6 -; GFX9-NEXT: v_addc_co_u32_e64 v11, s[4:5], 0, v5, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc +; GFX9-NEXT: v_add_co_u32_e64 v10, s[4:5], 1, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 +; GFX9-NEXT: v_addc_co_u32_e64 v11, s[4:5], 0, v7, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v0 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v7, v11, v9, s[4:5] +; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v4, v11, v9, s[4:5] ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v10, v8, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v0, vcc ; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: .LBB10_2: ; %Flow diff --git a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll index 9c03c850c8242..0e86a1ac68119 100644 --- a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll +++ b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll @@ -421,19 +421,19 @@ define protected amdgpu_kernel void @fadd(ptr addrspace(1) %p, ptr addrspace(1) ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: s_mov_b64 s[4:5], 0 -; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_load_dword s6, s[0:1], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s6 +; CHECK-NEXT: v_mov_b32_e32 v1, s6 ; CHECK-NEXT: .LBB18_1: ; %atomicrmw.start ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_mov_b32_e32 v3, v0 -; CHECK-NEXT: v_add_f32_e32 v2, 1.0, v3 -; CHECK-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc +; CHECK-NEXT: v_add_f32_e32 v0, 1.0, v1 +; CHECK-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; CHECK-NEXT: v_mov_b32_e32 v1, v0 ; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5] ; CHECK-NEXT: s_cbranch_execnz .LBB18_1 ; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end @@ -458,19 +458,19 @@ define protected amdgpu_kernel void @fsub(ptr addrspace(1) %p, ptr addrspace(1) ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: s_mov_b64 s[4:5], 0 -; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_load_dword s6, s[0:1], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s6 +; CHECK-NEXT: v_mov_b32_e32 v1, s6 ; CHECK-NEXT: .LBB19_1: ; %atomicrmw.start ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_mov_b32_e32 v3, v0 -; CHECK-NEXT: v_add_f32_e32 v2, -1.0, v3 -; CHECK-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc +; CHECK-NEXT: v_add_f32_e32 v0, -1.0, v1 +; CHECK-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; CHECK-NEXT: v_mov_b32_e32 v1, v0 ; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5] ; CHECK-NEXT: s_cbranch_execnz .LBB19_1 ; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/dagcomb-mullohi.ll b/llvm/test/CodeGen/AMDGPU/dagcomb-mullohi.ll index 613fdf388c0f1..0f45e99dd76c4 100644 --- a/llvm/test/CodeGen/AMDGPU/dagcomb-mullohi.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcomb-mullohi.ll @@ -64,13 +64,11 @@ define <2 x i32> @mullohi_2xu32(<2 x i32> %arg, <2 x i32> %arg1, ptr %arg2) { ; CHECK-LABEL: mullohi_2xu32: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v6, v1 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v1, 0 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v0, 0 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, v6, 0 -; CHECK-NEXT: v_mov_b32_e32 v6, v1 -; CHECK-NEXT: v_mov_b32_e32 v7, v3 -; CHECK-NEXT: v_mov_b32_e32 v1, v2 -; CHECK-NEXT: flat_store_dwordx2 v[4:5], v[6:7] +; CHECK-NEXT: v_mov_b32_e32 v2, v7 +; CHECK-NEXT: flat_store_dwordx2 v[4:5], v[1:2] +; CHECK-NEXT: v_mov_b32_e32 v1, v6 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] bb: diff --git a/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll b/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll index 54cbc25043db3..e841ec43fd064 100644 --- a/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll +++ b/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll @@ -193,14 +193,13 @@ define noundef i64 @urem64_3(i64 noundef %i) { ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, s6, v[2:3] ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v5, v3 ; GFX9-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, 0, vcc -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s6, v[2:3] -; GFX9-NEXT: v_alignbit_b32 v2, v3, v2, 1 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, 3, 0 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 1, v3 -; GFX9-NEXT: v_mov_b32_e32 v2, v5 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, 3, v[2:3] -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, s6, v[2:3] +; GFX9-NEXT: v_alignbit_b32 v2, v4, v3, 1 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, 3, 0 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 1, v4 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, 3, v[3:4] +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: urem64_3: @@ -238,14 +237,13 @@ define noundef i64 @urem64_3(i64 noundef %i) { ; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, 0xaaaaaaaa, v0, v[2:3] ; GFX1030-NEXT: v_add_co_u32 v2, s4, v5, v3 ; GFX1030-NEXT: v_add_co_ci_u32_e64 v3, null, 0, 0, s4 -; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, 0xaaaaaaaa, v1, v[2:3] -; GFX1030-NEXT: v_alignbit_b32 v2, v3, v2, 1 -; GFX1030-NEXT: v_lshrrev_b32_e32 v3, 1, v3 -; GFX1030-NEXT: v_mad_u64_u32 v[4:5], null, v2, 3, 0 -; GFX1030-NEXT: v_mov_b32_e32 v2, v5 -; GFX1030-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v4 -; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, v3, 3, v[2:3] -; GFX1030-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v2, vcc_lo +; GFX1030-NEXT: v_mad_u64_u32 v[3:4], null, 0xaaaaaaaa, v1, v[2:3] +; GFX1030-NEXT: v_alignbit_b32 v2, v4, v3, 1 +; GFX1030-NEXT: v_lshrrev_b32_e32 v4, 1, v4 +; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, v2, 3, 0 +; GFX1030-NEXT: v_mad_u64_u32 v[3:4], null, v4, 3, v[3:4] +; GFX1030-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2 +; GFX1030-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo ; GFX1030-NEXT: s_setpc_b64 s[30:31] entry: %rem = urem i64 %i, 3 @@ -265,14 +263,13 @@ define noundef i64 @urem64_6(i64 noundef %i) { ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, s6, v[2:3] ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v5, v3 ; GFX9-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, 0, vcc -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s6, v[2:3] -; GFX9-NEXT: v_alignbit_b32 v2, v3, v2, 2 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, 6, 0 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 2, v3 -; GFX9-NEXT: v_mov_b32_e32 v2, v5 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, 6, v[2:3] -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, s6, v[2:3] +; GFX9-NEXT: v_alignbit_b32 v2, v4, v3, 2 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, 6, 0 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 2, v4 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, 6, v[3:4] +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: urem64_6: @@ -310,14 +307,13 @@ define noundef i64 @urem64_6(i64 noundef %i) { ; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, 0xaaaaaaaa, v0, v[2:3] ; GFX1030-NEXT: v_add_co_u32 v2, s4, v5, v3 ; GFX1030-NEXT: v_add_co_ci_u32_e64 v3, null, 0, 0, s4 -; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, 0xaaaaaaaa, v1, v[2:3] -; GFX1030-NEXT: v_alignbit_b32 v2, v3, v2, 2 -; GFX1030-NEXT: v_lshrrev_b32_e32 v3, 2, v3 -; GFX1030-NEXT: v_mad_u64_u32 v[4:5], null, v2, 6, 0 -; GFX1030-NEXT: v_mov_b32_e32 v2, v5 -; GFX1030-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v4 -; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, v3, 6, v[2:3] -; GFX1030-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v2, vcc_lo +; GFX1030-NEXT: v_mad_u64_u32 v[3:4], null, 0xaaaaaaaa, v1, v[2:3] +; GFX1030-NEXT: v_alignbit_b32 v2, v4, v3, 2 +; GFX1030-NEXT: v_lshrrev_b32_e32 v4, 2, v4 +; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, v2, 6, 0 +; GFX1030-NEXT: v_mad_u64_u32 v[3:4], null, v4, 6, v[3:4] +; GFX1030-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2 +; GFX1030-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo ; GFX1030-NEXT: s_setpc_b64 s[30:31] entry: %rem = urem i64 %i, 6 diff --git a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll index 26f77898faf60..ddac86b3719c2 100644 --- a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll @@ -1953,68 +1953,66 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; SDAG-NEXT: v_mul_lo_u32 v14, v33, v3 ; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v33, v2, 0 -; SDAG-NEXT: v_mul_lo_u32 v24, v27, v2 -; SDAG-NEXT: v_mul_lo_u32 v25, v34, v31 -; SDAG-NEXT: v_mul_lo_u32 v34, v32, v30 -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v31, v33, 0 -; SDAG-NEXT: v_mov_b32_e32 v15, 0 -; SDAG-NEXT: v_mul_lo_u32 v38, v12, v7 -; SDAG-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v12, v6, 0 -; SDAG-NEXT: v_mul_lo_u32 v39, v13, v6 +; SDAG-NEXT: v_mul_lo_u32 v15, v27, v2 +; SDAG-NEXT: v_mul_lo_u32 v23, v34, v31 +; SDAG-NEXT: v_mul_lo_u32 v24, v32, v30 +; SDAG-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v31, v33, 0 +; SDAG-NEXT: v_mov_b32_e32 v22, 0 +; SDAG-NEXT: v_mul_lo_u32 v25, v12, v7 +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v6, 0 +; SDAG-NEXT: v_mul_lo_u32 v34, v13, v6 ; SDAG-NEXT: v_mul_lo_u32 v19, v19, v37 -; SDAG-NEXT: v_mul_lo_u32 v48, v18, v36 -; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v37, v12, 0 +; SDAG-NEXT: v_mul_lo_u32 v38, v18, v36 ; SDAG-NEXT: v_add_i32_e32 v11, vcc, v11, v14 -; SDAG-NEXT: v_mov_b32_e32 v14, v3 -; SDAG-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v30, v33, v[14:15] -; SDAG-NEXT: v_sub_i32_e32 v16, vcc, v16, v2 -; SDAG-NEXT: v_add_i32_e64 v21, s[4:5], v21, v38 -; SDAG-NEXT: v_add_i32_e64 v11, s[4:5], v11, v24 -; SDAG-NEXT: v_mov_b32_e32 v14, v22 -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v31, v27, v[14:15] -; SDAG-NEXT: v_xor_b32_e32 v24, v16, v28 -; SDAG-NEXT: v_add_i32_e64 v21, s[4:5], v21, v39 +; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v30, v33, v[21:22] +; SDAG-NEXT: v_sub_i32_e32 v16, vcc, v16, v20 +; SDAG-NEXT: v_add_i32_e64 v3, s[4:5], v3, v25 +; SDAG-NEXT: v_add_i32_e64 v11, s[4:5], v11, v15 +; SDAG-NEXT: v_mov_b32_e32 v21, v6 +; SDAG-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v31, v27, v[21:22] +; SDAG-NEXT: v_xor_b32_e32 v16, v16, v28 +; SDAG-NEXT: v_add_i32_e64 v3, s[4:5], v3, v34 ; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v32, v31, v[10:11] -; SDAG-NEXT: v_add_i32_e64 v22, s[4:5], v23, v3 -; SDAG-NEXT: v_addc_u32_e64 v23, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_subb_u32_e32 v31, vcc, v17, v2, vcc -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v18, v37, v[20:21] -; SDAG-NEXT: v_mov_b32_e32 v14, v7 -; SDAG-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v36, v12, v[14:15] -; SDAG-NEXT: v_add_i32_e64 v7, s[4:5], v25, v11 -; SDAG-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v30, v27, v[22:23] -; SDAG-NEXT: v_xor_b32_e32 v18, v31, v29 +; SDAG-NEXT: v_add_i32_e64 v6, s[4:5], v7, v15 +; SDAG-NEXT: v_addc_u32_e64 v7, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_subb_u32_e32 v14, vcc, v17, v14, vcc +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v18, v37, v[2:3] +; SDAG-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v37, v12, 0 +; SDAG-NEXT: v_add_i32_e64 v15, s[4:5], v23, v11 +; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v30, v27, v[6:7] +; SDAG-NEXT: v_xor_b32_e32 v17, v14, v29 ; SDAG-NEXT: v_add_i32_e64 v3, s[4:5], v19, v3 -; SDAG-NEXT: v_mov_b32_e32 v14, v16 -; SDAG-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v37, v13, v[14:15] -; SDAG-NEXT: v_add_i32_e64 v7, s[4:5], v34, v7 -; SDAG-NEXT: v_add_i32_e64 v3, s[4:5], v48, v3 -; SDAG-NEXT: v_add_i32_e64 v15, s[4:5], v17, v15 -; SDAG-NEXT: v_addc_u32_e64 v16, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_add_i32_e64 v10, s[4:5], v11, v10 -; SDAG-NEXT: v_addc_u32_e64 v7, s[4:5], v12, v7, s[4:5] -; SDAG-NEXT: v_subb_u32_e32 v0, vcc, v0, v10, vcc -; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v36, v13, v[15:16] -; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v7, vcc -; SDAG-NEXT: v_xor_b32_e32 v7, v0, v28 -; SDAG-NEXT: v_add_i32_e32 v10, vcc, v10, v2 -; SDAG-NEXT: v_addc_u32_e32 v11, vcc, v11, v3, vcc -; SDAG-NEXT: v_xor_b32_e32 v3, v1, v29 -; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v24, v28 -; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v18, v29, vcc -; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v7, v28, vcc -; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v29, vcc -; SDAG-NEXT: v_sub_i32_e32 v6, vcc, v8, v6 -; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v9, v14, vcc -; SDAG-NEXT: v_xor_b32_e32 v6, v6, v26 -; SDAG-NEXT: v_subb_u32_e32 v4, vcc, v4, v10, vcc -; SDAG-NEXT: v_xor_b32_e32 v7, v7, v35 -; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v5, v11, vcc -; SDAG-NEXT: v_xor_b32_e32 v8, v4, v26 +; SDAG-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v36, v12, v[21:22] +; SDAG-NEXT: v_add_i32_e64 v18, s[4:5], v24, v15 +; SDAG-NEXT: v_add_i32_e64 v3, s[4:5], v38, v3 +; SDAG-NEXT: v_mov_b32_e32 v21, v11 +; SDAG-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v37, v13, v[21:22] +; SDAG-NEXT: v_add_i32_e64 v6, s[4:5], v6, v10 +; SDAG-NEXT: v_addc_u32_e64 v10, s[4:5], v7, v18, s[4:5] +; SDAG-NEXT: v_subb_u32_e32 v0, vcc, v0, v6, vcc +; SDAG-NEXT: v_add_i32_e64 v6, s[4:5], v12, v15 +; SDAG-NEXT: v_addc_u32_e64 v7, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v10, vcc +; SDAG-NEXT: v_xor_b32_e32 v10, v0, v28 +; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v36, v13, v[6:7] +; SDAG-NEXT: v_xor_b32_e32 v11, v1, v29 +; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v16, v28 +; SDAG-NEXT: v_add_i32_e64 v6, s[4:5], v6, v2 +; SDAG-NEXT: v_addc_u32_e64 v7, s[4:5], v7, v3, s[4:5] +; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v17, v29, vcc +; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v10, v28, vcc +; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v11, v29, vcc +; SDAG-NEXT: v_sub_i32_e32 v8, vcc, v8, v20 +; SDAG-NEXT: v_subb_u32_e32 v9, vcc, v9, v14, vcc +; SDAG-NEXT: v_xor_b32_e32 v8, v8, v26 +; SDAG-NEXT: v_subb_u32_e32 v4, vcc, v4, v6, vcc +; SDAG-NEXT: v_xor_b32_e32 v6, v9, v35 +; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v5, v7, vcc +; SDAG-NEXT: v_xor_b32_e32 v7, v4, v26 ; SDAG-NEXT: v_xor_b32_e32 v9, v5, v35 -; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v6, v26 -; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v7, v35, vcc -; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v8, v26, vcc +; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v8, v26 +; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v6, v35, vcc +; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v7, v26, vcc ; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v9, v35, vcc ; SDAG-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; SDAG-NEXT: s_waitcnt vmcnt(0) @@ -2407,44 +2405,41 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v30, v31, 0 ; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v30, v18, 0 -; GISEL-NEXT: v_mul_lo_u32 v27, v30, v19 -; GISEL-NEXT: v_mul_lo_u32 v36, v29, v18 -; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v35, v20, 0 -; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v35, v2, 0 -; GISEL-NEXT: v_mul_lo_u32 v37, v35, v3 -; GISEL-NEXT: v_mul_lo_u32 v38, v34, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v29, v32, v[14:15] -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v34, v21, v[22:23] -; GISEL-NEXT: v_mov_b32_e32 v22, v19 -; GISEL-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v10, v31, v[2:3] -; GISEL-NEXT: v_mad_u64_u32 v[25:26], s[4:5], v4, v20, v[14:15] -; GISEL-NEXT: v_mov_b32_e32 v2, v23 +; GISEL-NEXT: v_mul_lo_u32 v26, v30, v19 +; GISEL-NEXT: v_mul_lo_u32 v27, v29, v18 +; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v35, v20, 0 +; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v35, v2, 0 +; GISEL-NEXT: v_mul_lo_u32 v36, v35, v3 +; GISEL-NEXT: v_mul_lo_u32 v37, v34, v2 +; GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v29, v32, v[14:15] +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v34, v21, v[18:19] +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v31, v[24:25] +; GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v4, v20, v[14:15] ; GISEL-NEXT: v_mad_u64_u32 v[14:15], vcc, v30, v32, v[1:2] -; GISEL-NEXT: v_mov_b32_e32 v23, v25 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v35, v21, v[22:23] -; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v29, v31, v[14:15] -; GISEL-NEXT: v_addc_u32_e64 v3, s[6:7], v24, v27, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v35, v21, v[23:24] +; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v29, v31, v[14:15] +; GISEL-NEXT: v_addc_u32_e64 v3, s[6:7], v3, v26, s[6:7] ; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v34, v20, v[1:2] -; GISEL-NEXT: v_addc_u32_e64 v2, s[6:7], v26, v37, s[6:7] -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v3, v36, vcc +; GISEL-NEXT: v_addc_u32_e64 v2, s[6:7], v25, v36, s[6:7] +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v3, v27, vcc ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v16, v0 -; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v17, v22, vcc -; GISEL-NEXT: v_xor_b32_e32 v19, v0, v28 -; GISEL-NEXT: v_addc_u32_e64 v0, s[4:5], v2, v38, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v12, v18 +; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v17, v18, vcc +; GISEL-NEXT: v_xor_b32_e32 v18, v0, v28 +; GISEL-NEXT: v_addc_u32_e64 v0, s[4:5], v2, v37, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v12, v22 ; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], v13, v14, s[4:5] -; GISEL-NEXT: v_xor_b32_e32 v18, v2, v33 +; GISEL-NEXT: v_xor_b32_e32 v22, v2, v33 ; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v10, v32, v[1:2] ; GISEL-NEXT: v_xor_b32_e32 v1, v16, v28 ; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v4, v21, v[0:1] ; GISEL-NEXT: v_xor_b32_e32 v10, v14, v33 ; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v11, v31, v[2:3] -; GISEL-NEXT: v_sub_i32_e64 v0, s[6:7], v19, v28 +; GISEL-NEXT: v_sub_i32_e64 v0, s[6:7], v18, v28 ; GISEL-NEXT: v_subb_u32_e64 v1, s[6:7], v1, v28, s[6:7] ; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v5, v20, v[12:13] -; GISEL-NEXT: v_sub_i32_e64 v4, s[8:9], v18, v33 +; GISEL-NEXT: v_sub_i32_e64 v4, s[8:9], v22, v33 ; GISEL-NEXT: v_subb_u32_e64 v5, s[8:9], v10, v33, s[8:9] -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v8, v23, vcc +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v8, v19, vcc ; GISEL-NEXT: v_subb_u32_e32 v8, vcc, v9, v16, vcc ; GISEL-NEXT: v_xor_b32_e32 v3, v3, v28 ; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v6, v15, s[4:5] @@ -2815,52 +2810,50 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_or_b32_e32 v16, v16, v20 ; SDAG-NEXT: .LBB3_12: ; %Flow12 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] -; SDAG-NEXT: v_mul_lo_u32 v20, v32, v11 -; SDAG-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v32, v10, 0 -; SDAG-NEXT: v_mul_lo_u32 v28, v30, v10 -; SDAG-NEXT: v_mul_lo_u32 v29, v33, v8 -; SDAG-NEXT: v_mul_lo_u32 v33, v31, v9 -; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v8, v32, 0 -; SDAG-NEXT: v_mov_b32_e32 v21, 0 -; SDAG-NEXT: v_mul_lo_u32 v34, v16, v15 -; SDAG-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v16, v14, 0 -; SDAG-NEXT: v_mul_lo_u32 v35, v17, v14 -; SDAG-NEXT: v_mul_lo_u32 v23, v23, v12 -; SDAG-NEXT: v_mul_lo_u32 v36, v22, v13 -; SDAG-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v12, v16, 0 -; SDAG-NEXT: v_add_i32_e32 v19, vcc, v19, v20 -; SDAG-NEXT: v_mov_b32_e32 v20, v11 -; SDAG-NEXT: v_mad_u64_u32 v[26:27], s[4:5], v9, v32, v[20:21] -; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v0, v10 -; SDAG-NEXT: v_add_i32_e64 v25, s[4:5], v25, v34 -; SDAG-NEXT: v_add_i32_e64 v19, s[4:5], v19, v28 -; SDAG-NEXT: v_mov_b32_e32 v20, v26 -; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v8, v30, v[20:21] -; SDAG-NEXT: v_add_i32_e64 v25, s[4:5], v25, v35 -; SDAG-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v31, v8, v[18:19] -; SDAG-NEXT: v_add_i32_e64 v26, s[4:5], v27, v11 -; SDAG-NEXT: v_addc_u32_e64 v27, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v10, vcc -; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v22, v12, v[24:25] -; SDAG-NEXT: v_mov_b32_e32 v20, v15 -; SDAG-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v13, v16, v[20:21] -; SDAG-NEXT: v_add_i32_e64 v19, s[4:5], v29, v19 -; SDAG-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v9, v30, v[26:27] -; SDAG-NEXT: v_add_i32_e64 v22, s[4:5], v23, v11 -; SDAG-NEXT: v_mov_b32_e32 v20, v15 -; SDAG-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v12, v17, v[20:21] -; SDAG-NEXT: v_add_i32_e64 v19, s[4:5], v33, v19 -; SDAG-NEXT: v_add_i32_e64 v20, s[4:5], v36, v22 -; SDAG-NEXT: v_add_i32_e64 v15, s[4:5], v16, v12 -; SDAG-NEXT: v_addc_u32_e64 v16, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_add_i32_e64 v8, s[4:5], v8, v18 -; SDAG-NEXT: v_addc_u32_e64 v12, s[4:5], v9, v19, s[4:5] +; SDAG-NEXT: v_mul_lo_u32 v21, v32, v11 +; SDAG-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v32, v10, 0 +; SDAG-NEXT: v_mul_lo_u32 v26, v30, v10 +; SDAG-NEXT: v_mul_lo_u32 v27, v33, v8 +; SDAG-NEXT: v_mul_lo_u32 v28, v31, v9 +; SDAG-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v8, v32, 0 +; SDAG-NEXT: v_mov_b32_e32 v20, 0 +; SDAG-NEXT: v_mul_lo_u32 v29, v16, v15 +; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v14, 0 +; SDAG-NEXT: v_mul_lo_u32 v33, v17, v14 +; SDAG-NEXT: v_mul_lo_u32 v34, v23, v12 +; SDAG-NEXT: v_mul_lo_u32 v35, v22, v13 +; SDAG-NEXT: v_add_i32_e32 v21, vcc, v25, v21 +; SDAG-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v9, v32, v[19:20] +; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v0, v18 +; SDAG-NEXT: v_add_i32_e64 v11, s[4:5], v11, v29 +; SDAG-NEXT: v_add_i32_e64 v25, s[4:5], v21, v26 +; SDAG-NEXT: v_mov_b32_e32 v19, v14 +; SDAG-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v8, v30, v[19:20] +; SDAG-NEXT: v_add_i32_e64 v11, s[4:5], v11, v33 +; SDAG-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v31, v8, v[24:25] +; SDAG-NEXT: v_add_i32_e64 v14, s[4:5], v15, v19 +; SDAG-NEXT: v_addc_u32_e64 v15, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v18, vcc +; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v22, v12, v[10:11] +; SDAG-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v12, v16, 0 +; SDAG-NEXT: v_add_i32_e64 v21, s[4:5], v27, v24 +; SDAG-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v9, v30, v[14:15] +; SDAG-NEXT: v_add_i32_e64 v11, s[4:5], v34, v11 +; SDAG-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v13, v16, v[19:20] +; SDAG-NEXT: v_add_i32_e64 v16, s[4:5], v28, v21 +; SDAG-NEXT: v_add_i32_e64 v21, s[4:5], v35, v11 +; SDAG-NEXT: v_mov_b32_e32 v19, v14 +; SDAG-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v12, v17, v[19:20] +; SDAG-NEXT: v_add_i32_e64 v8, s[4:5], v8, v23 +; SDAG-NEXT: v_addc_u32_e64 v14, s[4:5], v9, v16, s[4:5] ; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v2, v8, vcc -; SDAG-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v17, v[15:16] -; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc +; SDAG-NEXT: v_add_i32_e64 v8, s[4:5], v15, v12 +; SDAG-NEXT: v_addc_u32_e64 v9, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v14, vcc +; SDAG-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v17, v[8:9] ; SDAG-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; SDAG-NEXT: v_addc_u32_e32 v9, vcc, v9, v20, vcc -; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v4, v14 +; SDAG-NEXT: v_addc_u32_e32 v9, vcc, v9, v21, vcc +; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v4, v18 ; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v5, v11, vcc ; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v6, v8, vcc ; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v7, v9, vcc @@ -3223,18 +3216,16 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_mad_u64_u32 v[26:27], s[4:5], v12, v18, 0 ; GISEL-NEXT: v_mul_lo_u32 v36, v12, v19 ; GISEL-NEXT: v_mul_lo_u32 v37, v13, v18 -; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v9, v33, v[22:23] -; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v13, v25, v[26:27] -; GISEL-NEXT: v_mad_u64_u32 v[26:27], s[4:5], v10, v32, v[18:19] -; GISEL-NEXT: v_mad_u64_u32 v[28:29], s[4:5], v14, v24, v[22:23] -; GISEL-NEXT: v_mov_b32_e32 v18, v26 -; GISEL-NEXT: v_mad_u64_u32 v[30:31], vcc, v8, v33, v[17:18] -; GISEL-NEXT: v_mov_b32_e32 v22, v28 +; GISEL-NEXT: v_mad_u64_u32 v[28:29], s[4:5], v9, v33, v[22:23] +; GISEL-NEXT: v_mad_u64_u32 v[30:31], s[4:5], v13, v25, v[26:27] +; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v10, v32, v[28:29] +; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v14, v24, v[30:31] +; GISEL-NEXT: v_mad_u64_u32 v[26:27], vcc, v8, v33, v[17:18] ; GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v12, v25, v[21:22] -; GISEL-NEXT: v_mad_u64_u32 v[21:22], s[6:7], v9, v32, v[30:31] -; GISEL-NEXT: v_addc_u32_e64 v12, s[6:7], v27, v34, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[21:22], s[6:7], v9, v32, v[26:27] +; GISEL-NEXT: v_addc_u32_e64 v12, s[6:7], v19, v34, s[6:7] ; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[6:7], v13, v24, v[17:18] -; GISEL-NEXT: v_addc_u32_e64 v13, s[6:7], v29, v36, s[6:7] +; GISEL-NEXT: v_addc_u32_e64 v13, s[6:7], v23, v36, s[6:7] ; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v12, v35, vcc ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v16 ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v21, vcc diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll index 1a4a54b81c78f..5c4e25c3120e9 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll @@ -4448,18 +4448,18 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ig ; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB22_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4771,18 +4771,18 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB24_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5462,18 +5462,18 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdg ; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB28_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll index 59b0537b817d2..53e9468c5d5b6 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll @@ -34,24 +34,24 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB0_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: @@ -79,23 +79,23 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB0_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: @@ -177,24 +177,24 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB1_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -224,23 +224,23 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB1_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -324,21 +324,18 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX942-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4 +; GFX942-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX942-NEXT: s_movk_i32 s0, 0xf800 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc -; GFX942-NEXT: flat_load_dword v0, v[0:1] +; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc +; GFX942-NEXT: flat_load_dword v3, v[4:5] ; GFX942-NEXT: s_mov_b32 s1, -1 -; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX942-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NEXT: v_max_f32_e32 v0, v3, v3 ; GFX942-NEXT: v_max_f32_e32 v2, v0, v1 ; GFX942-NEXT: buffer_wbl2 sc1 @@ -347,6 +344,7 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB2_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -386,20 +384,20 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5] ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_max_f32_e32 v0, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v0, v1 -; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] glc +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX90A-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB2_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -950,24 +948,24 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB6_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -997,25 +995,25 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB6_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1252,24 +1250,24 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB8_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: @@ -1331,23 +1329,23 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: @@ -1445,24 +1443,24 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB9_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -1490,23 +1488,23 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -1592,24 +1590,24 @@ define float @flat_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB10_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory: @@ -1637,23 +1635,23 @@ define float @flat_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory: @@ -1735,24 +1733,24 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB11_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -1782,23 +1780,23 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -1882,21 +1880,18 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX942-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4 +; GFX942-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX942-NEXT: s_movk_i32 s0, 0xf800 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc -; GFX942-NEXT: flat_load_dword v0, v[0:1] +; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc +; GFX942-NEXT: flat_load_dword v3, v[4:5] ; GFX942-NEXT: s_mov_b32 s1, -1 -; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX942-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NEXT: v_max_f32_e32 v0, v3, v3 ; GFX942-NEXT: v_max_f32_e32 v2, v0, v1 ; GFX942-NEXT: buffer_wbl2 sc1 @@ -1905,6 +1900,7 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB12_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1944,20 +1940,20 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5] ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_max_f32_e32 v0, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v0, v1 -; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] glc +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX90A-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2508,24 +2504,24 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB16_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2555,25 +2551,25 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -5415,54 +5411,50 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 -; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB24_3 -; GFX90A-NEXT: ; %bb.1: ; %Flow2 -; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB24_6 -; GFX90A-NEXT: .LBB24_2: ; %atomicrmw.phi -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; GFX90A-NEXT: .LBB24_3: ; %atomicrmw.global -; GFX90A-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB24_4 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: .LBB24_4: ; %atomicrmw.start +; GFX90A-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX90A-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB24_4 -; GFX90A-NEXT: ; %bb.5: ; %Flow +; GFX90A-NEXT: s_cbranch_execnz .LBB24_2 +; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX90A-NEXT: .LBB24_4: ; %Flow2 ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execz .LBB24_2 -; GFX90A-NEXT: .LBB24_6: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc -; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_cbranch_execz .LBB24_6 +; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] -; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX90A-NEXT: v_max_f64 v[0:1], v[0:1], v[6:7] +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll index c9c9f332fe391..5ee3ff67aa8a0 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll @@ -34,24 +34,24 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB0_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: @@ -79,23 +79,23 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB0_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: @@ -177,24 +177,24 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB1_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -224,23 +224,23 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB1_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -324,21 +324,18 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX942-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4 +; GFX942-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX942-NEXT: s_movk_i32 s0, 0xf800 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc -; GFX942-NEXT: flat_load_dword v0, v[0:1] +; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc +; GFX942-NEXT: flat_load_dword v3, v[4:5] ; GFX942-NEXT: s_mov_b32 s1, -1 -; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX942-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NEXT: v_max_f32_e32 v0, v3, v3 ; GFX942-NEXT: v_min_f32_e32 v2, v0, v1 ; GFX942-NEXT: buffer_wbl2 sc1 @@ -347,6 +344,7 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB2_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -386,20 +384,20 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5] ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_max_f32_e32 v0, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v0, v1 -; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] glc +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX90A-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB2_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -950,24 +948,24 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB6_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -997,25 +995,25 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB6_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1252,24 +1250,24 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB8_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: @@ -1331,23 +1329,23 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: @@ -1445,24 +1443,24 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB9_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -1490,23 +1488,23 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -1592,24 +1590,24 @@ define float @flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB10_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: @@ -1637,23 +1635,23 @@ define float @flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: @@ -1735,24 +1733,24 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB11_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -1782,23 +1780,23 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -1882,21 +1880,18 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX942-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4 +; GFX942-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX942-NEXT: s_movk_i32 s0, 0xf800 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc -; GFX942-NEXT: flat_load_dword v0, v[0:1] +; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc +; GFX942-NEXT: flat_load_dword v3, v[4:5] ; GFX942-NEXT: s_mov_b32 s1, -1 -; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX942-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NEXT: v_max_f32_e32 v0, v3, v3 ; GFX942-NEXT: v_min_f32_e32 v2, v0, v1 ; GFX942-NEXT: buffer_wbl2 sc1 @@ -1905,6 +1900,7 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB12_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1944,20 +1940,20 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5] ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_max_f32_e32 v0, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v0, v1 -; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] glc +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX90A-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2508,24 +2504,24 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB16_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2555,25 +2551,25 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -5415,54 +5411,50 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 -; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB24_3 -; GFX90A-NEXT: ; %bb.1: ; %Flow2 -; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB24_6 -; GFX90A-NEXT: .LBB24_2: ; %atomicrmw.phi -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; GFX90A-NEXT: .LBB24_3: ; %atomicrmw.global -; GFX90A-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB24_4 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: .LBB24_4: ; %atomicrmw.start +; GFX90A-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX90A-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX90A-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB24_4 -; GFX90A-NEXT: ; %bb.5: ; %Flow +; GFX90A-NEXT: s_cbranch_execnz .LBB24_2 +; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX90A-NEXT: .LBB24_4: ; %Flow2 ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execz .LBB24_2 -; GFX90A-NEXT: .LBB24_6: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc -; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_cbranch_execz .LBB24_6 +; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX90A-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] -; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX90A-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7] +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll index 587c2ea885077..0e563c26d27ea 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll @@ -49,12 +49,11 @@ define float @flat_agent_atomic_fsub_ret_f32(ptr %ptr, float %val) #0 { ; GFX942-LABEL: flat_agent_atomic_fsub_ret_f32: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: flat_load_dword v5, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 @@ -62,6 +61,7 @@ define float @flat_agent_atomic_fsub_ret_f32(ptr %ptr, float %val) #0 { ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB0_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -122,18 +122,18 @@ define float @flat_agent_atomic_fsub_ret_f32(ptr %ptr, float %val) #0 { ; GFX90A-LABEL: flat_agent_atomic_fsub_ret_f32: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB0_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -245,12 +245,11 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %val ; GFX942-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 @@ -258,6 +257,7 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %val ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB1_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -319,18 +319,18 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %val ; GFX90A-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB1_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -445,27 +445,25 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg(ptr %ptr, float %val ; GFX942-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4 +; GFX942-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX942-NEXT: s_movk_i32 s0, 0xf800 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc -; GFX942-NEXT: flat_load_dword v0, v[0:1] +; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc +; GFX942-NEXT: flat_load_dword v7, v[4:5] ; GFX942-NEXT: s_mov_b32 s1, -1 -; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX942-NEXT: v_sub_f32_e32 v6, v7, v2 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v0 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB2_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -533,18 +531,18 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg(ptr %ptr, float %val ; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5] ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: v_sub_f32_e32 v0, v1, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB2_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1254,12 +1252,11 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %va ; GFX942-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 @@ -1267,6 +1264,7 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %va ; GFX942-NEXT: buffer_inv sc0 sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB6_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1328,12 +1326,11 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %va ; GFX90A-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc @@ -1342,6 +1339,7 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %va ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB6_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1656,12 +1654,11 @@ define float @flat_agent_atomic_fsub_ret_f32__ftz(ptr %ptr, float %val) #1 { ; GFX942-LABEL: flat_agent_atomic_fsub_ret_f32__ftz: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: flat_load_dword v5, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 @@ -1669,6 +1666,7 @@ define float @flat_agent_atomic_fsub_ret_f32__ftz(ptr %ptr, float %val) #1 { ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB8_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1729,18 +1727,18 @@ define float @flat_agent_atomic_fsub_ret_f32__ftz(ptr %ptr, float %val) #1 { ; GFX90A-LABEL: flat_agent_atomic_fsub_ret_f32__ftz: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1852,12 +1850,11 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, float ; GFX942-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 @@ -1865,6 +1862,7 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, float ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB9_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1926,18 +1924,18 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, float ; GFX90A-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2052,27 +2050,25 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr %ptr, float ; GFX942-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4 +; GFX942-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX942-NEXT: s_movk_i32 s0, 0xf800 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc -; GFX942-NEXT: flat_load_dword v0, v[0:1] +; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc +; GFX942-NEXT: flat_load_dword v7, v[4:5] ; GFX942-NEXT: s_mov_b32 s1, -1 -; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX942-NEXT: v_sub_f32_e32 v6, v7, v2 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v0 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB10_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2140,18 +2136,18 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr %ptr, float ; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5] ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: v_sub_f32_e32 v0, v1, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2861,12 +2857,11 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, floa ; GFX942-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 @@ -2874,6 +2869,7 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, floa ; GFX942-NEXT: buffer_inv sc0 sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB14_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2935,12 +2931,11 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, floa ; GFX90A-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc @@ -2949,6 +2944,7 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, floa ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3295,12 +3291,11 @@ define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 { ; GFX942-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX942-NEXT: s_cbranch_execz .LBB16_4 ; GFX942-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX942-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX942-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX942-NEXT: s_mov_b64 s[2:3], 0 ; GFX942-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[4:5] ; GFX942-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] sc0 @@ -3308,6 +3303,7 @@ define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 { ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[4:5] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB16_2 ; GFX942-NEXT: ; %bb.3: ; %Flow @@ -3433,51 +3429,47 @@ define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 { ; GFX90A-LABEL: flat_agent_atomic_fsub_ret_f64: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB16_3 -; GFX90A-NEXT: ; %bb.1: ; %Flow3 -; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB16_6 -; GFX90A-NEXT: .LBB16_2: ; %atomicrmw.phi -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; GFX90A-NEXT: .LBB16_3: ; %atomicrmw.global -; GFX90A-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB16_4 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: .LBB16_4: ; %atomicrmw.start +; GFX90A-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: v_add_f64 v[6:7], v[8:9], -v[2:3] -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX90A-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB16_4 -; GFX90A-NEXT: ; %bb.5: ; %Flow +; GFX90A-NEXT: s_cbranch_execnz .LBB16_2 +; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: .LBB16_4: ; %Flow3 ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execz .LBB16_2 -; GFX90A-NEXT: .LBB16_6: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_cbranch_execz .LBB16_6 +; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], -v[2:3] -; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_add_f64 v[0:1], v[4:5], -v[2:3] +; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB16_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3713,8 +3705,8 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: s_mov_b64 s[2:3], 0x7f8 ; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] -; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX942-NEXT: v_lshl_add_u64 v[8:9], v[0:1], 0, s[2:3] +; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v9 ; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX942-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -3726,31 +3718,31 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; GFX942-NEXT: .LBB17_3: ; %atomicrmw.global -; GFX942-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX942-NEXT: flat_load_dwordx2 v[6:7], v[8:9] ; GFX942-NEXT: s_mov_b64 s[2:3], 0 ; GFX942-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[8:9], v[0:1] -; GFX942-NEXT: v_add_f64 v[6:7], v[8:9], -v[2:3] +; GFX942-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] sc0 +; GFX942-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[0:1] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB17_4 ; GFX942-NEXT: ; %bb.5: ; %Flow ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX942-NEXT: ; implicit-def: $vgpr8_vgpr9 ; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX942-NEXT: s_cbranch_execz .LBB17_2 ; GFX942-NEXT: .LBB17_6: ; %atomicrmw.private -; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v4, -1, v8, vcc ; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_add_f64 v[2:3], v[0:1], -v[2:3] @@ -3870,10 +3862,10 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v ; GFX90A-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7f8, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, 0x7f8, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v1, vcc +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v9 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -3885,29 +3877,29 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; GFX90A-NEXT: .LBB17_3: ; %atomicrmw.global -; GFX90A-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX90A-NEXT: flat_load_dwordx2 v[6:7], v[8:9] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: v_add_f64 v[6:7], v[8:9], -v[2:3] -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX90A-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB17_4 ; GFX90A-NEXT: ; %bb.5: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr8_vgpr9 ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB17_2 ; GFX90A-NEXT: .LBB17_6: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v8, vcc ; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -4160,8 +4152,8 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX942-NEXT: s_movk_i32 s2, 0xf800 ; GFX942-NEXT: s_mov_b32 s3, -1 ; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] -; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX942-NEXT: v_lshl_add_u64 v[8:9], v[0:1], 0, s[2:3] +; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v9 ; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX942-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -4173,31 +4165,31 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; GFX942-NEXT: .LBB18_3: ; %atomicrmw.global -; GFX942-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX942-NEXT: flat_load_dwordx2 v[6:7], v[8:9] ; GFX942-NEXT: s_mov_b64 s[2:3], 0 ; GFX942-NEXT: .LBB18_4: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[8:9], v[0:1] -; GFX942-NEXT: v_add_f64 v[6:7], v[8:9], -v[2:3] +; GFX942-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] sc0 +; GFX942-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[0:1] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB18_4 ; GFX942-NEXT: ; %bb.5: ; %Flow ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX942-NEXT: ; implicit-def: $vgpr8_vgpr9 ; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX942-NEXT: s_cbranch_execz .LBB18_2 ; GFX942-NEXT: .LBB18_6: ; %atomicrmw.private -; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v4, -1, v8, vcc ; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_add_f64 v[2:3], v[0:1], -v[2:3] @@ -4317,10 +4309,10 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX90A-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_neg: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, 0xfffff800, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: v_addc_co_u32_e32 v9, vcc, -1, v1, vcc +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v9 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -4332,29 +4324,29 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; GFX90A-NEXT: .LBB18_3: ; %atomicrmw.global -; GFX90A-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX90A-NEXT: flat_load_dwordx2 v[6:7], v[8:9] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: .LBB18_4: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: v_add_f64 v[6:7], v[8:9], -v[2:3] -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX90A-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB18_4 ; GFX90A-NEXT: ; %bb.5: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr8_vgpr9 ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB18_2 ; GFX90A-NEXT: .LBB18_6: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v8, vcc ; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll index 357234080235a..8a4b2c428e31a 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll @@ -12687,20 +12687,19 @@ define float @flat_atomic_fmax_f32_saddr_rtn(ptr inreg %ptr, float %data) { ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1] -; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v0 -; GFX950-SDAG-NEXT: flat_load_dword v0, v[2:3] offset:40 +; GFX950-SDAG-NEXT: flat_load_dword v1, v[2:3] offset:40 ; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], 0 -; GFX950-SDAG-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX950-SDAG-NEXT: v_max_f32_e32 v4, v0, v0 ; GFX950-SDAG-NEXT: .LBB118_1: ; %atomicrmw.start ; GFX950-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v0 -; GFX950-SDAG-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX950-SDAG-NEXT: v_max_f32_e32 v4, v0, v1 -; GFX950-SDAG-NEXT: flat_atomic_cmpswap v0, v[2:3], v[4:5] offset:40 sc0 +; GFX950-SDAG-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX950-SDAG-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX950-SDAG-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-SDAG-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-SDAG-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB118_1 ; GFX950-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12812,20 +12811,19 @@ define float @flat_atomic_fmin_f32_saddr_rtn(ptr inreg %ptr, float %data) { ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1] -; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v0 -; GFX950-SDAG-NEXT: flat_load_dword v0, v[2:3] offset:40 +; GFX950-SDAG-NEXT: flat_load_dword v1, v[2:3] offset:40 ; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], 0 -; GFX950-SDAG-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX950-SDAG-NEXT: v_max_f32_e32 v4, v0, v0 ; GFX950-SDAG-NEXT: .LBB120_1: ; %atomicrmw.start ; GFX950-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v0 -; GFX950-SDAG-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX950-SDAG-NEXT: v_min_f32_e32 v4, v0, v1 -; GFX950-SDAG-NEXT: flat_atomic_cmpswap v0, v[2:3], v[4:5] offset:40 sc0 +; GFX950-SDAG-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX950-SDAG-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX950-SDAG-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-SDAG-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-SDAG-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB120_1 ; GFX950-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll index 460f1211d1386..0c4a15f6a9d5e 100644 --- a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll @@ -61,34 +61,32 @@ define i128 @fptosi_f64_to_i128(double %x) { ; SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v1, s[6:7] ; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] -; SDAG-NEXT: v_mov_b32_e32 v5, 0 -; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v1, s[4:5] -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v10, 0 ; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] -; SDAG-NEXT: v_mul_lo_u32 v13, v8, v2 -; SDAG-NEXT: v_mov_b32_e32 v4, v1 -; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v10, v[4:5] -; SDAG-NEXT: v_mul_lo_u32 v14, v10, v3 -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v2, 0 -; SDAG-NEXT: v_mov_b32_e32 v4, v6 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v8, v[4:5] -; SDAG-NEXT: v_add3_u32 v3, v3, v14, v13 -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[2:3] -; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v7, v5 +; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v1, s[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v11, v10, 0 +; SDAG-NEXT: v_mul_lo_u32 v12, v8, v2 +; SDAG-NEXT: v_mul_lo_u32 v13, v10, v3 +; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v10, v[1:2] +; SDAG-NEXT: v_add3_u32 v4, v4, v13, v12 +; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v11, v[3:4] +; SDAG-NEXT: v_mov_b32_e32 v1, v5 +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v11, v8, v[1:2] +; SDAG-NEXT: v_mul_lo_u32 v10, v9, v7 +; SDAG-NEXT: v_mul_lo_u32 v9, v9, v11 +; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v6, v2 ; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_mul_lo_u32 v3, v9, v11 -; SDAG-NEXT: v_mul_lo_u32 v9, v9, v12 -; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v8, v[5:6] +; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v8, v[5:6] +; SDAG-NEXT: v_add3_u32 v4, v9, v4, v10 ; SDAG-NEXT: ; implicit-def: $vgpr10 ; SDAG-NEXT: ; implicit-def: $vgpr8 -; SDAG-NEXT: v_add3_u32 v3, v9, v2, v3 -; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v1 -; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5] -; SDAG-NEXT: v_mov_b32_e32 v1, v4 +; SDAG-NEXT: ; implicit-def: $vgpr9 +; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v3 +; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v4, s[4:5] ; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 ; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 -; SDAG-NEXT: ; implicit-def: $vgpr9 ; SDAG-NEXT: .LBB0_4: ; %Flow ; SDAG-NEXT: s_andn2_saveexec_b64 s[12:13], s[12:13] ; SDAG-NEXT: s_cbranch_execz .LBB0_6 @@ -119,10 +117,11 @@ define i128 @fptosi_f64_to_i128(double %x) { ; SDAG-NEXT: ; %bb.8: ; %fp-to-i-if-then5 ; SDAG-NEXT: v_bfrev_b32_e32 v0, 1 ; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 -; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc -; SDAG-NEXT: v_mov_b32_e32 v0, v2 -; SDAG-NEXT: v_mov_b32_e32 v1, v2 +; SDAG-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; SDAG-NEXT: v_mov_b32_e32 v3, v2 +; SDAG-NEXT: v_mov_b32_e32 v0, v1 +; SDAG-NEXT: v_mov_b32_e32 v2, v1 ; SDAG-NEXT: ; %bb.9: ; %Flow3 ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] ; SDAG-NEXT: .LBB0_10: ; %fp-to-i-cleanup @@ -234,37 +233,36 @@ define i128 @fptosi_f64_to_i128(double %x) { ; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7] ; GISEL-NEXT: s_cbranch_execz .LBB0_4 ; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else -; GISEL-NEXT: v_add_u32_e32 v7, 0xfffffbcd, v6 -; GISEL-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v9, 0 -; GISEL-NEXT: v_add_u32_e32 v6, 0xfffffb8d, v6 -; GISEL-NEXT: v_sub_u32_e32 v2, 64, v7 -; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5] -; GISEL-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[0:1] -; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v8, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v8, 0 -; GISEL-NEXT: v_mov_b32_e32 v2, v10 -; GISEL-NEXT: v_mul_lo_u32 v10, v12, v9 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v13, v9 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v8, v[6:7] -; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v11, v10, s[10:11] -; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v14, v9, v[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7] -; GISEL-NEXT: ; implicit-def: $vgpr9 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v8, v[6:7] +; GISEL-NEXT: v_add_u32_e32 v2, 0xfffffbcd, v6 +; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[4:5] +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 +; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v10, v9, 0 +; GISEL-NEXT: v_add_u32_e32 v3, 0xfffffb8d, v6 +; GISEL-NEXT: v_sub_u32_e32 v6, 64, v2 +; GISEL-NEXT: v_lshrrev_b64 v[6:7], v6, v[4:5] +; GISEL-NEXT: v_lshlrev_b64 v[3:4], v3, v[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v2 +; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[8:9], v11, v9, v[0:1] +; GISEL-NEXT: v_cndmask_b32_e64 v12, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v10, v8, 0 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v12, v8, v[5:6] +; GISEL-NEXT: v_mul_lo_u32 v13, v11, v9 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[8:9], v10, v9, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v10, v10, v9 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v11, v8, v[5:6] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[10:11], v3, v10, s[10:11] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[8:9], v3, v13, s[8:9] +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[8:9], v12, v9, v[3:4] +; GISEL-NEXT: v_cndmask_b32_e32 v3, v4, v7, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v7, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v7, v8, v[5:6] ; GISEL-NEXT: ; implicit-def: $vgpr6 ; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr8 +; GISEL-NEXT: ; implicit-def: $vgpr9 ; GISEL-NEXT: .LBB0_4: ; %Flow ; GISEL-NEXT: s_andn2_saveexec_b64 s[8:9], s[16:17] ; GISEL-NEXT: s_cbranch_execz .LBB0_6 @@ -276,16 +274,15 @@ define i128 @fptosi_f64_to_i128(double %x) { ; GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GISEL-NEXT: v_cndmask_b32_e32 v6, v0, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v6, v9, 0 ; GISEL-NEXT: v_cndmask_b32_e32 v7, v1, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v6, v9, 0 ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v6, v8, 0 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v7, v9, v[2:3] +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v7, v9, v[4:5] ; GISEL-NEXT: v_mul_lo_u32 v10, v7, v9 -; GISEL-NEXT: v_mov_b32_e32 v2, v4 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], vcc, v6, v9, v[1:2] +; GISEL-NEXT: v_mad_u64_u32 v[4:5], vcc, v6, v9, v[1:2] ; GISEL-NEXT: v_mul_lo_u32 v6, v6, v9 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v7, v8, v[3:4] -; GISEL-NEXT: v_addc_co_u32_e64 v3, s[6:7], v5, v6, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v7, v8, v[4:5] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[6:7], v3, v6, s[6:7] ; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v10, vcc ; GISEL-NEXT: .LBB0_6: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] @@ -429,34 +426,32 @@ define i128 @fptoui_f64_to_i128(double %x) { ; SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v1, s[6:7] ; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] -; SDAG-NEXT: v_mov_b32_e32 v5, 0 -; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v1, s[4:5] -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v10, 0 ; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] -; SDAG-NEXT: v_mul_lo_u32 v13, v8, v2 -; SDAG-NEXT: v_mov_b32_e32 v4, v1 -; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v10, v[4:5] -; SDAG-NEXT: v_mul_lo_u32 v14, v10, v3 -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v2, 0 -; SDAG-NEXT: v_mov_b32_e32 v4, v6 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v8, v[4:5] -; SDAG-NEXT: v_add3_u32 v3, v3, v14, v13 -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[2:3] -; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v7, v5 +; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v1, s[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v11, v10, 0 +; SDAG-NEXT: v_mul_lo_u32 v12, v8, v2 +; SDAG-NEXT: v_mul_lo_u32 v13, v10, v3 +; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v10, v[1:2] +; SDAG-NEXT: v_add3_u32 v4, v4, v13, v12 +; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v11, v[3:4] +; SDAG-NEXT: v_mov_b32_e32 v1, v5 +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v11, v8, v[1:2] +; SDAG-NEXT: v_mul_lo_u32 v10, v9, v7 +; SDAG-NEXT: v_mul_lo_u32 v9, v9, v11 +; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v6, v2 ; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_mul_lo_u32 v3, v9, v11 -; SDAG-NEXT: v_mul_lo_u32 v9, v9, v12 -; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v8, v[5:6] +; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v8, v[5:6] +; SDAG-NEXT: v_add3_u32 v4, v9, v4, v10 ; SDAG-NEXT: ; implicit-def: $vgpr10 ; SDAG-NEXT: ; implicit-def: $vgpr8 -; SDAG-NEXT: v_add3_u32 v3, v9, v2, v3 -; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v1 -; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5] -; SDAG-NEXT: v_mov_b32_e32 v1, v4 +; SDAG-NEXT: ; implicit-def: $vgpr9 +; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v3 +; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v4, s[4:5] ; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 ; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 -; SDAG-NEXT: ; implicit-def: $vgpr9 ; SDAG-NEXT: .LBB1_4: ; %Flow ; SDAG-NEXT: s_andn2_saveexec_b64 s[12:13], s[12:13] ; SDAG-NEXT: s_cbranch_execz .LBB1_6 @@ -487,10 +482,11 @@ define i128 @fptoui_f64_to_i128(double %x) { ; SDAG-NEXT: ; %bb.8: ; %fp-to-i-if-then5 ; SDAG-NEXT: v_bfrev_b32_e32 v0, 1 ; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 -; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc -; SDAG-NEXT: v_mov_b32_e32 v0, v2 -; SDAG-NEXT: v_mov_b32_e32 v1, v2 +; SDAG-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; SDAG-NEXT: v_mov_b32_e32 v3, v2 +; SDAG-NEXT: v_mov_b32_e32 v0, v1 +; SDAG-NEXT: v_mov_b32_e32 v2, v1 ; SDAG-NEXT: ; %bb.9: ; %Flow3 ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] ; SDAG-NEXT: .LBB1_10: ; %fp-to-i-cleanup @@ -602,37 +598,36 @@ define i128 @fptoui_f64_to_i128(double %x) { ; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7] ; GISEL-NEXT: s_cbranch_execz .LBB1_4 ; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else -; GISEL-NEXT: v_add_u32_e32 v7, 0xfffffbcd, v6 -; GISEL-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v9, 0 -; GISEL-NEXT: v_add_u32_e32 v6, 0xfffffb8d, v6 -; GISEL-NEXT: v_sub_u32_e32 v2, 64, v7 -; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5] -; GISEL-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[0:1] -; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v8, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v8, 0 -; GISEL-NEXT: v_mov_b32_e32 v2, v10 -; GISEL-NEXT: v_mul_lo_u32 v10, v12, v9 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v13, v9 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v8, v[6:7] -; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v11, v10, s[10:11] -; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v14, v9, v[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7] -; GISEL-NEXT: ; implicit-def: $vgpr9 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v8, v[6:7] +; GISEL-NEXT: v_add_u32_e32 v2, 0xfffffbcd, v6 +; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[4:5] +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 +; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v10, v9, 0 +; GISEL-NEXT: v_add_u32_e32 v3, 0xfffffb8d, v6 +; GISEL-NEXT: v_sub_u32_e32 v6, 64, v2 +; GISEL-NEXT: v_lshrrev_b64 v[6:7], v6, v[4:5] +; GISEL-NEXT: v_lshlrev_b64 v[3:4], v3, v[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v2 +; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[8:9], v11, v9, v[0:1] +; GISEL-NEXT: v_cndmask_b32_e64 v12, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v10, v8, 0 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v12, v8, v[5:6] +; GISEL-NEXT: v_mul_lo_u32 v13, v11, v9 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[8:9], v10, v9, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v10, v10, v9 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v11, v8, v[5:6] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[10:11], v3, v10, s[10:11] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[8:9], v3, v13, s[8:9] +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[8:9], v12, v9, v[3:4] +; GISEL-NEXT: v_cndmask_b32_e32 v3, v4, v7, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v7, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v7, v8, v[5:6] ; GISEL-NEXT: ; implicit-def: $vgpr6 ; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr8 +; GISEL-NEXT: ; implicit-def: $vgpr9 ; GISEL-NEXT: .LBB1_4: ; %Flow ; GISEL-NEXT: s_andn2_saveexec_b64 s[8:9], s[16:17] ; GISEL-NEXT: s_cbranch_execz .LBB1_6 @@ -644,16 +639,15 @@ define i128 @fptoui_f64_to_i128(double %x) { ; GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GISEL-NEXT: v_cndmask_b32_e32 v6, v0, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v6, v9, 0 ; GISEL-NEXT: v_cndmask_b32_e32 v7, v1, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v6, v9, 0 ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v6, v8, 0 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v7, v9, v[2:3] +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v7, v9, v[4:5] ; GISEL-NEXT: v_mul_lo_u32 v10, v7, v9 -; GISEL-NEXT: v_mov_b32_e32 v2, v4 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], vcc, v6, v9, v[1:2] +; GISEL-NEXT: v_mad_u64_u32 v[4:5], vcc, v6, v9, v[1:2] ; GISEL-NEXT: v_mul_lo_u32 v6, v6, v9 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v7, v8, v[3:4] -; GISEL-NEXT: v_addc_co_u32_e64 v3, s[6:7], v5, v6, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v7, v8, v[4:5] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[6:7], v3, v6, s[6:7] ; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v10, vcc ; GISEL-NEXT: .LBB1_6: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] @@ -798,30 +792,30 @@ define i128 @fptosi_f32_to_i128(float %x) { ; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[7:8] ; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v1, s[4:5] -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v11, 0 -; SDAG-NEXT: v_mul_lo_u32 v4, v9, v2 -; SDAG-NEXT: v_mul_lo_u32 v14, v11, v3 -; SDAG-NEXT: v_mov_b32_e32 v5, v1 -; SDAG-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v11, v[5:6] -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v2, 0 -; SDAG-NEXT: v_mov_b32_e32 v5, v7 -; SDAG-NEXT: v_mul_lo_u32 v7, v10, v13 -; SDAG-NEXT: v_add3_u32 v3, v3, v14, v4 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v9, v[5:6] -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v10, v13, v[2:3] -; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v8, v5 -; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_mul_lo_u32 v3, v10, v12 -; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v9, v[5:6] +; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v0, s[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v7, v11, 0 +; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v1, s[4:5] +; SDAG-NEXT: v_mul_lo_u32 v8, v9, v2 +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v11, v[5:6] +; SDAG-NEXT: v_mul_lo_u32 v12, v11, v3 +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v11, v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v5, v0 +; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v9, v[5:6] +; SDAG-NEXT: v_add3_u32 v3, v3, v12, v8 +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v7, v[2:3] +; SDAG-NEXT: v_add_co_u32_e64 v0, s[4:5], v1, v6 +; SDAG-NEXT: v_addc_co_u32_e64 v1, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_mul_lo_u32 v8, v10, v13 +; SDAG-NEXT: v_mul_lo_u32 v7, v10, v7 +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v9, v[0:1] ; SDAG-NEXT: ; implicit-def: $vgpr11 ; SDAG-NEXT: ; implicit-def: $vgpr9 ; SDAG-NEXT: ; implicit-def: $vgpr10 -; SDAG-NEXT: v_add3_u32 v3, v7, v2, v3 -; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v1 -; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5] -; SDAG-NEXT: v_mov_b32_e32 v1, v4 +; SDAG-NEXT: v_add3_u32 v3, v7, v3, v8 +; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v0, v2 +; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v1, v3, s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v0, v4 +; SDAG-NEXT: v_mov_b32_e32 v1, v5 ; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6 ; SDAG-NEXT: ; implicit-def: $vgpr7_vgpr8 ; SDAG-NEXT: .LBB2_4: ; %Flow @@ -836,9 +830,9 @@ define i128 @fptosi_f32_to_i128(float %x) { ; SDAG-NEXT: v_cndmask_b32_e64 v3, v0, v7, s[4:5] ; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v11, 0 ; SDAG-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v6, v2 ; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v9, v[1:2] -; SDAG-NEXT: v_mov_b32_e32 v1, v5 -; SDAG-NEXT: v_mad_i64_i32 v[2:3], s[4:5], v10, v3, v[1:2] +; SDAG-NEXT: v_mad_i64_i32 v[2:3], s[4:5], v10, v3, v[5:6] ; SDAG-NEXT: v_mov_b32_e32 v1, v4 ; SDAG-NEXT: .LBB2_6: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] @@ -847,10 +841,11 @@ define i128 @fptosi_f32_to_i128(float %x) { ; SDAG-NEXT: ; %bb.8: ; %fp-to-i-if-then5 ; SDAG-NEXT: v_bfrev_b32_e32 v0, 1 ; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 -; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc -; SDAG-NEXT: v_mov_b32_e32 v0, v2 -; SDAG-NEXT: v_mov_b32_e32 v1, v2 +; SDAG-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; SDAG-NEXT: v_mov_b32_e32 v3, v2 +; SDAG-NEXT: v_mov_b32_e32 v0, v1 +; SDAG-NEXT: v_mov_b32_e32 v2, v1 ; SDAG-NEXT: ; %bb.9: ; %Flow3 ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] ; SDAG-NEXT: .LBB2_10: ; %fp-to-i-cleanup @@ -961,37 +956,36 @@ define i128 @fptosi_f32_to_i128(float %x) { ; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7] ; GISEL-NEXT: s_cbranch_execz .LBB2_4 ; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else -; GISEL-NEXT: v_add_u32_e32 v7, 0xffffff6a, v6 -; GISEL-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v8, 0 -; GISEL-NEXT: v_add_u32_e32 v6, 0xffffff2a, v6 -; GISEL-NEXT: v_sub_u32_e32 v2, 64, v7 -; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5] -; GISEL-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v8, v[0:1] -; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v9, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v9, 0 -; GISEL-NEXT: v_mov_b32_e32 v2, v10 -; GISEL-NEXT: v_mul_lo_u32 v10, v12, v8 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v13, v8 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v9, v[6:7] -; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v11, v10, s[10:11] -; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v14, v8, v[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7] -; GISEL-NEXT: ; implicit-def: $vgpr8 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v9, v[6:7] +; GISEL-NEXT: v_add_u32_e32 v2, 0xffffff6a, v6 +; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[4:5] +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 +; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v10, v8, 0 +; GISEL-NEXT: v_add_u32_e32 v3, 0xffffff2a, v6 +; GISEL-NEXT: v_sub_u32_e32 v6, 64, v2 +; GISEL-NEXT: v_lshrrev_b64 v[6:7], v6, v[4:5] +; GISEL-NEXT: v_lshlrev_b64 v[3:4], v3, v[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v2 +; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[8:9], v11, v8, v[0:1] +; GISEL-NEXT: v_cndmask_b32_e64 v12, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v10, v9, 0 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v12, v9, v[5:6] +; GISEL-NEXT: v_mul_lo_u32 v13, v11, v8 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[8:9], v10, v8, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v10, v10, v8 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v11, v9, v[5:6] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[10:11], v3, v10, s[10:11] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[8:9], v3, v13, s[8:9] +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[8:9], v12, v8, v[3:4] +; GISEL-NEXT: v_cndmask_b32_e32 v3, v4, v7, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v7, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v7, v9, v[5:6] ; GISEL-NEXT: ; implicit-def: $vgpr6 ; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr9 +; GISEL-NEXT: ; implicit-def: $vgpr8 ; GISEL-NEXT: .LBB2_4: ; %Flow ; GISEL-NEXT: s_andn2_saveexec_b64 s[6:7], s[16:17] ; GISEL-NEXT: s_cbranch_execz .LBB2_6 @@ -1004,11 +998,11 @@ define i128 @fptosi_f32_to_i128(float %x) { ; GISEL-NEXT: v_cndmask_b32_e32 v6, v0, v4, vcc ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v6, v9, 0 ; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v6, v8, 0 -; GISEL-NEXT: v_mov_b32_e32 v4, v1 ; GISEL-NEXT: v_mul_lo_u32 v7, v6, v8 -; GISEL-NEXT: v_mov_b32_e32 v5, v2 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v6, v8, v[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[4:5], vcc, v6, v8, v[1:2] ; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc +; GISEL-NEXT: v_mov_b32_e32 v1, v4 +; GISEL-NEXT: v_mov_b32_e32 v2, v5 ; GISEL-NEXT: .LBB2_6: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] ; GISEL-NEXT: .LBB2_7: ; %Flow2 @@ -1152,30 +1146,30 @@ define i128 @fptoui_f32_to_i128(float %x) { ; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[7:8] ; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v1, s[4:5] -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v11, 0 -; SDAG-NEXT: v_mul_lo_u32 v4, v9, v2 -; SDAG-NEXT: v_mul_lo_u32 v14, v11, v3 -; SDAG-NEXT: v_mov_b32_e32 v5, v1 -; SDAG-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v11, v[5:6] -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v2, 0 -; SDAG-NEXT: v_mov_b32_e32 v5, v7 -; SDAG-NEXT: v_mul_lo_u32 v7, v10, v13 -; SDAG-NEXT: v_add3_u32 v3, v3, v14, v4 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v9, v[5:6] -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v10, v13, v[2:3] -; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v8, v5 -; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_mul_lo_u32 v3, v10, v12 -; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v9, v[5:6] +; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v0, s[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v7, v11, 0 +; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v1, s[4:5] +; SDAG-NEXT: v_mul_lo_u32 v8, v9, v2 +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v11, v[5:6] +; SDAG-NEXT: v_mul_lo_u32 v12, v11, v3 +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v11, v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v5, v0 +; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v9, v[5:6] +; SDAG-NEXT: v_add3_u32 v3, v3, v12, v8 +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v7, v[2:3] +; SDAG-NEXT: v_add_co_u32_e64 v0, s[4:5], v1, v6 +; SDAG-NEXT: v_addc_co_u32_e64 v1, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_mul_lo_u32 v8, v10, v13 +; SDAG-NEXT: v_mul_lo_u32 v7, v10, v7 +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v9, v[0:1] ; SDAG-NEXT: ; implicit-def: $vgpr11 ; SDAG-NEXT: ; implicit-def: $vgpr9 ; SDAG-NEXT: ; implicit-def: $vgpr10 -; SDAG-NEXT: v_add3_u32 v3, v7, v2, v3 -; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v1 -; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5] -; SDAG-NEXT: v_mov_b32_e32 v1, v4 +; SDAG-NEXT: v_add3_u32 v3, v7, v3, v8 +; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v0, v2 +; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v1, v3, s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v0, v4 +; SDAG-NEXT: v_mov_b32_e32 v1, v5 ; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6 ; SDAG-NEXT: ; implicit-def: $vgpr7_vgpr8 ; SDAG-NEXT: .LBB3_4: ; %Flow @@ -1190,9 +1184,9 @@ define i128 @fptoui_f32_to_i128(float %x) { ; SDAG-NEXT: v_cndmask_b32_e64 v3, v0, v7, s[4:5] ; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v11, 0 ; SDAG-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v6, v2 ; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v9, v[1:2] -; SDAG-NEXT: v_mov_b32_e32 v1, v5 -; SDAG-NEXT: v_mad_i64_i32 v[2:3], s[4:5], v10, v3, v[1:2] +; SDAG-NEXT: v_mad_i64_i32 v[2:3], s[4:5], v10, v3, v[5:6] ; SDAG-NEXT: v_mov_b32_e32 v1, v4 ; SDAG-NEXT: .LBB3_6: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] @@ -1201,10 +1195,11 @@ define i128 @fptoui_f32_to_i128(float %x) { ; SDAG-NEXT: ; %bb.8: ; %fp-to-i-if-then5 ; SDAG-NEXT: v_bfrev_b32_e32 v0, 1 ; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 -; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc -; SDAG-NEXT: v_mov_b32_e32 v0, v2 -; SDAG-NEXT: v_mov_b32_e32 v1, v2 +; SDAG-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; SDAG-NEXT: v_mov_b32_e32 v3, v2 +; SDAG-NEXT: v_mov_b32_e32 v0, v1 +; SDAG-NEXT: v_mov_b32_e32 v2, v1 ; SDAG-NEXT: ; %bb.9: ; %Flow3 ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] ; SDAG-NEXT: .LBB3_10: ; %fp-to-i-cleanup @@ -1315,37 +1310,36 @@ define i128 @fptoui_f32_to_i128(float %x) { ; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7] ; GISEL-NEXT: s_cbranch_execz .LBB3_4 ; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else -; GISEL-NEXT: v_add_u32_e32 v7, 0xffffff6a, v6 -; GISEL-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v8, 0 -; GISEL-NEXT: v_add_u32_e32 v6, 0xffffff2a, v6 -; GISEL-NEXT: v_sub_u32_e32 v2, 64, v7 -; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5] -; GISEL-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v8, v[0:1] -; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v9, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v9, 0 -; GISEL-NEXT: v_mov_b32_e32 v2, v10 -; GISEL-NEXT: v_mul_lo_u32 v10, v12, v8 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v13, v8 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v9, v[6:7] -; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v11, v10, s[10:11] -; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v14, v8, v[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7] -; GISEL-NEXT: ; implicit-def: $vgpr8 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v9, v[6:7] +; GISEL-NEXT: v_add_u32_e32 v2, 0xffffff6a, v6 +; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[4:5] +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 +; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v10, v8, 0 +; GISEL-NEXT: v_add_u32_e32 v3, 0xffffff2a, v6 +; GISEL-NEXT: v_sub_u32_e32 v6, 64, v2 +; GISEL-NEXT: v_lshrrev_b64 v[6:7], v6, v[4:5] +; GISEL-NEXT: v_lshlrev_b64 v[3:4], v3, v[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v2 +; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[8:9], v11, v8, v[0:1] +; GISEL-NEXT: v_cndmask_b32_e64 v12, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v10, v9, 0 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v12, v9, v[5:6] +; GISEL-NEXT: v_mul_lo_u32 v13, v11, v8 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[8:9], v10, v8, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v10, v10, v8 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v11, v9, v[5:6] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[10:11], v3, v10, s[10:11] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[8:9], v3, v13, s[8:9] +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[8:9], v12, v8, v[3:4] +; GISEL-NEXT: v_cndmask_b32_e32 v3, v4, v7, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v7, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v7, v9, v[5:6] ; GISEL-NEXT: ; implicit-def: $vgpr6 ; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr9 +; GISEL-NEXT: ; implicit-def: $vgpr8 ; GISEL-NEXT: .LBB3_4: ; %Flow ; GISEL-NEXT: s_andn2_saveexec_b64 s[6:7], s[16:17] ; GISEL-NEXT: s_cbranch_execz .LBB3_6 @@ -1358,11 +1352,11 @@ define i128 @fptoui_f32_to_i128(float %x) { ; GISEL-NEXT: v_cndmask_b32_e32 v6, v0, v4, vcc ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v6, v9, 0 ; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v6, v8, 0 -; GISEL-NEXT: v_mov_b32_e32 v4, v1 ; GISEL-NEXT: v_mul_lo_u32 v7, v6, v8 -; GISEL-NEXT: v_mov_b32_e32 v5, v2 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v6, v8, v[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[4:5], vcc, v6, v8, v[1:2] ; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc +; GISEL-NEXT: v_mov_b32_e32 v1, v4 +; GISEL-NEXT: v_mov_b32_e32 v2, v5 ; GISEL-NEXT: .LBB3_6: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] ; GISEL-NEXT: .LBB3_7: ; %Flow2 @@ -1545,28 +1539,28 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) { ; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[7:8] ; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v1, s[4:5] -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v9, 0 -; SDAG-NEXT: v_mul_lo_u32 v4, v10, v2 -; SDAG-NEXT: v_mul_lo_u32 v14, v9, v3 -; SDAG-NEXT: v_mov_b32_e32 v5, v1 -; SDAG-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v9, v[5:6] -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v2, 0 -; SDAG-NEXT: v_mov_b32_e32 v5, v7 -; SDAG-NEXT: v_mul_lo_u32 v7, v11, v13 -; SDAG-NEXT: v_add3_u32 v3, v3, v14, v4 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v10, v[5:6] -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v11, v13, v[2:3] -; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v8, v5 -; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_mul_lo_u32 v3, v11, v12 -; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v10, v[5:6] +; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v0, s[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v7, v9, 0 +; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v1, s[4:5] +; SDAG-NEXT: v_mul_lo_u32 v8, v10, v2 +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v9, v[5:6] +; SDAG-NEXT: v_mul_lo_u32 v12, v9, v3 +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v9, v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v5, v0 +; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v10, v[5:6] +; SDAG-NEXT: v_add3_u32 v3, v3, v12, v8 +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v7, v[2:3] +; SDAG-NEXT: v_add_co_u32_e64 v0, s[4:5], v1, v6 +; SDAG-NEXT: v_addc_co_u32_e64 v1, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_mul_lo_u32 v8, v11, v13 +; SDAG-NEXT: v_mul_lo_u32 v7, v11, v7 +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v10, v[0:1] ; SDAG-NEXT: ; implicit-def: $vgpr9 -; SDAG-NEXT: v_add3_u32 v3, v7, v2, v3 -; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v1 -; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5] -; SDAG-NEXT: v_mov_b32_e32 v1, v4 +; SDAG-NEXT: v_add3_u32 v3, v7, v3, v8 +; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v0, v2 +; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v1, v3, s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v0, v4 +; SDAG-NEXT: v_mov_b32_e32 v1, v5 ; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6 ; SDAG-NEXT: ; implicit-def: $vgpr7_vgpr8 ; SDAG-NEXT: .LBB6_4: ; %Flow @@ -1590,10 +1584,11 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) { ; SDAG-NEXT: ; %bb.8: ; %fp-to-i-if-then5 ; SDAG-NEXT: v_bfrev_b32_e32 v0, 1 ; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 -; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc -; SDAG-NEXT: v_mov_b32_e32 v0, v2 -; SDAG-NEXT: v_mov_b32_e32 v1, v2 +; SDAG-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; SDAG-NEXT: v_mov_b32_e32 v3, v2 +; SDAG-NEXT: v_mov_b32_e32 v0, v1 +; SDAG-NEXT: v_mov_b32_e32 v2, v1 ; SDAG-NEXT: ; %bb.9: ; %Flow3 ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] ; SDAG-NEXT: .LBB6_10: ; %fp-to-i-cleanup @@ -1705,33 +1700,32 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) { ; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7] ; GISEL-NEXT: s_cbranch_execz .LBB6_4 ; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else -; GISEL-NEXT: v_add_u32_e32 v10, 0xffffff7a, v5 -; GISEL-NEXT: v_lshlrev_b64 v[0:1], v10, v[6:7] -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 -; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v9, 0 -; GISEL-NEXT: v_add_u32_e32 v4, 0xffffff3a, v5 -; GISEL-NEXT: v_sub_u32_e32 v2, 64, v10 -; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[6:7] -; GISEL-NEXT: v_lshlrev_b64 v[4:5], v4, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[0:1] -; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v8, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v8, 0 -; GISEL-NEXT: v_mov_b32_e32 v2, v10 -; GISEL-NEXT: v_mul_lo_u32 v10, v12, v9 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v13, v9 +; GISEL-NEXT: v_add_u32_e32 v2, 0xffffff7a, v5 +; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[6:7] +; GISEL-NEXT: v_add_u32_e32 v5, 0xffffff3a, v5 +; GISEL-NEXT: v_sub_u32_e32 v3, 64, v2 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 +; GISEL-NEXT: v_lshrrev_b64 v[3:4], v3, v[6:7] +; GISEL-NEXT: v_lshlrev_b64 v[5:6], v5, v[6:7] +; GISEL-NEXT: v_cndmask_b32_e32 v7, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v7, v9, 0 ; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v8, v[6:7] -; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v11, v10, s[10:11] -; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v14, v9, v[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v2 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v12, v9, v[0:1] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v7, v8, 0 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v5, v8, v[10:11] +; GISEL-NEXT: v_mul_lo_u32 v13, v12, v9 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v7, v9, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v7, v7, v9 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v12, v8, v[10:11] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[10:11], v3, v7, s[10:11] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[8:9], v3, v13, s[8:9] +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v5, v9, v[3:4] +; GISEL-NEXT: v_cndmask_b32_e32 v3, v6, v4, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v8, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v8, v[10:11] ; GISEL-NEXT: ; implicit-def: $vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GISEL-NEXT: ; implicit-def: $vgpr8 @@ -1893,28 +1887,28 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) { ; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[7:8] ; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v1, s[4:5] -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v9, 0 -; SDAG-NEXT: v_mul_lo_u32 v4, v10, v2 -; SDAG-NEXT: v_mul_lo_u32 v14, v9, v3 -; SDAG-NEXT: v_mov_b32_e32 v5, v1 -; SDAG-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v9, v[5:6] -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v2, 0 -; SDAG-NEXT: v_mov_b32_e32 v5, v7 -; SDAG-NEXT: v_mul_lo_u32 v7, v11, v13 -; SDAG-NEXT: v_add3_u32 v3, v3, v14, v4 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v10, v[5:6] -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v11, v13, v[2:3] -; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v8, v5 -; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_mul_lo_u32 v3, v11, v12 -; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v10, v[5:6] +; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v0, s[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v7, v9, 0 +; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v1, s[4:5] +; SDAG-NEXT: v_mul_lo_u32 v8, v10, v2 +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v9, v[5:6] +; SDAG-NEXT: v_mul_lo_u32 v12, v9, v3 +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v9, v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v5, v0 +; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v10, v[5:6] +; SDAG-NEXT: v_add3_u32 v3, v3, v12, v8 +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v7, v[2:3] +; SDAG-NEXT: v_add_co_u32_e64 v0, s[4:5], v1, v6 +; SDAG-NEXT: v_addc_co_u32_e64 v1, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_mul_lo_u32 v8, v11, v13 +; SDAG-NEXT: v_mul_lo_u32 v7, v11, v7 +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v10, v[0:1] ; SDAG-NEXT: ; implicit-def: $vgpr9 -; SDAG-NEXT: v_add3_u32 v3, v7, v2, v3 -; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v1 -; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5] -; SDAG-NEXT: v_mov_b32_e32 v1, v4 +; SDAG-NEXT: v_add3_u32 v3, v7, v3, v8 +; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v0, v2 +; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v1, v3, s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v0, v4 +; SDAG-NEXT: v_mov_b32_e32 v1, v5 ; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6 ; SDAG-NEXT: ; implicit-def: $vgpr7_vgpr8 ; SDAG-NEXT: .LBB7_4: ; %Flow @@ -1938,10 +1932,11 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) { ; SDAG-NEXT: ; %bb.8: ; %fp-to-i-if-then5 ; SDAG-NEXT: v_bfrev_b32_e32 v0, 1 ; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 -; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc -; SDAG-NEXT: v_mov_b32_e32 v0, v2 -; SDAG-NEXT: v_mov_b32_e32 v1, v2 +; SDAG-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; SDAG-NEXT: v_mov_b32_e32 v3, v2 +; SDAG-NEXT: v_mov_b32_e32 v0, v1 +; SDAG-NEXT: v_mov_b32_e32 v2, v1 ; SDAG-NEXT: ; %bb.9: ; %Flow3 ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] ; SDAG-NEXT: .LBB7_10: ; %fp-to-i-cleanup @@ -2053,33 +2048,32 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) { ; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7] ; GISEL-NEXT: s_cbranch_execz .LBB7_4 ; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else -; GISEL-NEXT: v_add_u32_e32 v10, 0xffffff7a, v5 -; GISEL-NEXT: v_lshlrev_b64 v[0:1], v10, v[6:7] -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 -; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v9, 0 -; GISEL-NEXT: v_add_u32_e32 v4, 0xffffff3a, v5 -; GISEL-NEXT: v_sub_u32_e32 v2, 64, v10 -; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[6:7] -; GISEL-NEXT: v_lshlrev_b64 v[4:5], v4, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[0:1] -; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v8, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v8, 0 -; GISEL-NEXT: v_mov_b32_e32 v2, v10 -; GISEL-NEXT: v_mul_lo_u32 v10, v12, v9 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v13, v9 +; GISEL-NEXT: v_add_u32_e32 v2, 0xffffff7a, v5 +; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[6:7] +; GISEL-NEXT: v_add_u32_e32 v5, 0xffffff3a, v5 +; GISEL-NEXT: v_sub_u32_e32 v3, 64, v2 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 +; GISEL-NEXT: v_lshrrev_b64 v[3:4], v3, v[6:7] +; GISEL-NEXT: v_lshlrev_b64 v[5:6], v5, v[6:7] +; GISEL-NEXT: v_cndmask_b32_e32 v7, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v7, v9, 0 ; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v8, v[6:7] -; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v11, v10, s[10:11] -; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v14, v9, v[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v2 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v12, v9, v[0:1] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v7, v8, 0 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v5, v8, v[10:11] +; GISEL-NEXT: v_mul_lo_u32 v13, v12, v9 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v7, v9, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v7, v7, v9 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v12, v8, v[10:11] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[10:11], v3, v7, s[10:11] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[8:9], v3, v13, s[8:9] +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v5, v9, v[3:4] +; GISEL-NEXT: v_cndmask_b32_e32 v3, v6, v4, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v8, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v8, v[10:11] ; GISEL-NEXT: ; implicit-def: $vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GISEL-NEXT: ; implicit-def: $vgpr8 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll index bd9fe397bfc68..aede3928d7b0f 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll @@ -46,7 +46,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec_xnull = COPY [[REG_SEQUENCE]] ; GFX90A-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64 = SI_PS_LIVE - ; GFX90A-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[DEF:%[0-9]+]]:av_32 = IMPLICIT_DEF ; GFX90A-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX90A-NEXT: S_BRANCH %bb.1 ; GFX90A-NEXT: {{ $}} @@ -76,12 +76,13 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX90A-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_4]], 323, 12, 15, 0, implicit $exec ; GFX90A-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_4]], 0, killed [[V_MOV_B32_dpp5]], 0, 0, implicit $mode, implicit $exec ; GFX90A-NEXT: [[V_MOV_B32_dpp6:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_5]], 312, 15, 15, 0, implicit $exec + ; GFX90A-NEXT: [[COPY8:%[0-9]+]]:av_32 = COPY [[V_MOV_B32_dpp6]] ; GFX90A-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 63 ; GFX90A-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READLANE_B32 [[V_ADD_F32_e64_5]], killed [[S_MOV_B32_1]] ; GFX90A-NEXT: early-clobber %2:sgpr_32 = STRICT_WWM killed [[V_READLANE_B32_]], implicit $exec ; GFX90A-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec - ; GFX90A-NEXT: [[COPY8:%[0-9]+]]:vreg_1 = COPY [[V_CMP_EQ_U32_e64_]] - ; GFX90A-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[COPY9:%[0-9]+]]:vreg_1 = COPY [[V_CMP_EQ_U32_e64_]] + ; GFX90A-NEXT: [[DEF2:%[0-9]+]]:av_32 = IMPLICIT_DEF ; GFX90A-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX90A-NEXT: S_BRANCH %bb.2 ; GFX90A-NEXT: {{ $}} @@ -89,28 +90,30 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX90A-NEXT: successors: %bb.4(0x80000000) ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY %2 - ; GFX90A-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_1]], [[COPY9]], [[COPY3]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY %2 + ; GFX90A-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_1]], [[COPY10]], [[COPY3]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: [[COPY11:%[0-9]+]]:av_32 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]] ; GFX90A-NEXT: S_BRANCH %bb.4 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.3.Flow: ; GFX90A-NEXT: successors: %bb.5(0x80000000) ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[DEF]], %bb.0, %8, %bb.4 + ; GFX90A-NEXT: [[PHI:%[0-9]+]]:av_32 = PHI [[DEF]], %bb.0, %8, %bb.4 ; GFX90A-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX90A-NEXT: S_BRANCH %bb.5 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.4 (%ir-block.35): ; GFX90A-NEXT: successors: %bb.3(0x80000000) ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF2]], %bb.1, [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.2 + ; GFX90A-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF2]], %bb.1, [[COPY11]], %bb.2 ; GFX90A-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[PHI1]], implicit $exec - ; GFX90A-NEXT: early-clobber %46:vgpr_32 = STRICT_WWM [[V_MOV_B32_dpp6]], implicit $exec - ; GFX90A-NEXT: [[V_ADD_F32_e64_6:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_READFIRSTLANE_B32_]], 0, killed %46, 0, 0, implicit $mode, implicit $exec - ; GFX90A-NEXT: [[COPY10:%[0-9]+]]:sreg_64_xexec = COPY [[COPY8]] - ; GFX90A-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX90A-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD_F32_e64_6]], 0, [[COPY11]], [[COPY10]], implicit $exec + ; GFX90A-NEXT: early-clobber %48:vgpr_32 = STRICT_WWM [[COPY8]], implicit $exec + ; GFX90A-NEXT: [[V_ADD_F32_e64_6:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_READFIRSTLANE_B32_]], 0, killed %48, 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: [[COPY12:%[0-9]+]]:sreg_64_xexec = COPY [[COPY9]] + ; GFX90A-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX90A-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD_F32_e64_6]], 0, [[COPY13]], [[COPY12]], implicit $exec + ; GFX90A-NEXT: [[COPY14:%[0-9]+]]:av_32 = COPY [[V_CNDMASK_B32_e64_]] ; GFX90A-NEXT: S_BRANCH %bb.3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.5 (%ir-block.41): @@ -128,7 +131,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX942-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec_xnull = COPY [[REG_SEQUENCE]] ; GFX942-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64 = SI_PS_LIVE - ; GFX942-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GFX942-NEXT: [[DEF:%[0-9]+]]:av_32 = IMPLICIT_DEF ; GFX942-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX942-NEXT: S_BRANCH %bb.1 ; GFX942-NEXT: {{ $}} @@ -158,12 +161,13 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX942-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_4]], 323, 12, 15, 0, implicit $exec ; GFX942-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_4]], 0, killed [[V_MOV_B32_dpp5]], 0, 0, implicit $mode, implicit $exec ; GFX942-NEXT: [[V_MOV_B32_dpp6:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_5]], 312, 15, 15, 0, implicit $exec + ; GFX942-NEXT: [[COPY8:%[0-9]+]]:av_32 = COPY [[V_MOV_B32_dpp6]] ; GFX942-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 63 ; GFX942-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READLANE_B32 [[V_ADD_F32_e64_5]], killed [[S_MOV_B32_1]] ; GFX942-NEXT: early-clobber %2:sgpr_32 = STRICT_WWM killed [[V_READLANE_B32_]], implicit $exec ; GFX942-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec - ; GFX942-NEXT: [[COPY8:%[0-9]+]]:vreg_1 = COPY [[V_CMP_EQ_U32_e64_]] - ; GFX942-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GFX942-NEXT: [[COPY9:%[0-9]+]]:vreg_1 = COPY [[V_CMP_EQ_U32_e64_]] + ; GFX942-NEXT: [[DEF2:%[0-9]+]]:av_32 = IMPLICIT_DEF ; GFX942-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX942-NEXT: S_BRANCH %bb.2 ; GFX942-NEXT: {{ $}} @@ -171,28 +175,30 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX942-NEXT: successors: %bb.4(0x80000000) ; GFX942-NEXT: {{ $}} ; GFX942-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX942-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY %2 - ; GFX942-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_1]], [[COPY9]], [[COPY3]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX942-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY %2 + ; GFX942-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_1]], [[COPY10]], [[COPY3]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX942-NEXT: [[COPY11:%[0-9]+]]:av_32 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]] ; GFX942-NEXT: S_BRANCH %bb.4 ; GFX942-NEXT: {{ $}} ; GFX942-NEXT: bb.3.Flow: ; GFX942-NEXT: successors: %bb.5(0x80000000) ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[DEF]], %bb.0, %8, %bb.4 + ; GFX942-NEXT: [[PHI:%[0-9]+]]:av_32 = PHI [[DEF]], %bb.0, %8, %bb.4 ; GFX942-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX942-NEXT: S_BRANCH %bb.5 ; GFX942-NEXT: {{ $}} ; GFX942-NEXT: bb.4 (%ir-block.35): ; GFX942-NEXT: successors: %bb.3(0x80000000) ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF2]], %bb.1, [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.2 + ; GFX942-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF2]], %bb.1, [[COPY11]], %bb.2 ; GFX942-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[PHI1]], implicit $exec - ; GFX942-NEXT: early-clobber %45:vgpr_32 = STRICT_WWM [[V_MOV_B32_dpp6]], implicit $exec - ; GFX942-NEXT: [[V_ADD_F32_e64_6:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_READFIRSTLANE_B32_]], 0, killed %45, 0, 0, implicit $mode, implicit $exec - ; GFX942-NEXT: [[COPY10:%[0-9]+]]:sreg_64_xexec = COPY [[COPY8]] - ; GFX942-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX942-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD_F32_e64_6]], 0, [[COPY11]], [[COPY10]], implicit $exec + ; GFX942-NEXT: early-clobber %47:vgpr_32 = STRICT_WWM [[COPY8]], implicit $exec + ; GFX942-NEXT: [[V_ADD_F32_e64_6:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_READFIRSTLANE_B32_]], 0, killed %47, 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: [[COPY12:%[0-9]+]]:sreg_64_xexec = COPY [[COPY9]] + ; GFX942-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX942-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD_F32_e64_6]], 0, [[COPY13]], [[COPY12]], implicit $exec + ; GFX942-NEXT: [[COPY14:%[0-9]+]]:av_32 = COPY [[V_CNDMASK_B32_e64_]] ; GFX942-NEXT: S_BRANCH %bb.3 ; GFX942-NEXT: {{ $}} ; GFX942-NEXT: bb.5 (%ir-block.41): diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll index 02884559bdaa9..3d0ebc72791bd 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll @@ -38,33 +38,35 @@ define amdgpu_ps double @global_atomic_fadd_f64_rtn_atomicrmw(ptr addrspace(1) % ; GFX90A-NEXT: [[COPY6:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] ; GFX90A-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 [[COPY6]], 0, 0, implicit $exec :: (load (s64) from %ir.ptr, addrspace 1) ; GFX90A-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX90A-NEXT: [[COPY7:%[0-9]+]]:av_64_align2 = COPY [[GLOBAL_LOAD_DWORDX2_]] ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.1.atomicrmw.start: ; GFX90A-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: [[PHI:%[0-9]+]]:sreg_64 = PHI [[S_MOV_B64_]], %bb.0, %4, %bb.1 - ; GFX90A-NEXT: [[PHI1:%[0-9]+]]:vreg_64_align2 = PHI [[GLOBAL_LOAD_DWORDX2_]], %bb.0, %3, %bb.1 + ; GFX90A-NEXT: [[PHI1:%[0-9]+]]:vreg_64_align2 = PHI [[COPY7]], %bb.0, %3, %bb.1 ; GFX90A-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[PHI1]], 0, [[COPY4]], 0, 0, implicit $mode, implicit $exec - ; GFX90A-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_]].sub1 - ; GFX90A-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_]].sub0 - ; GFX90A-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub1 - ; GFX90A-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub0 - ; GFX90A-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY8]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, killed [[COPY10]], %subreg.sub2, killed [[COPY9]], %subreg.sub3 - ; GFX90A-NEXT: [[COPY11:%[0-9]+]]:vreg_128_align2 = COPY [[REG_SEQUENCE2]] - ; GFX90A-NEXT: [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_CMPSWAP_X2_RTN [[COPY5]], killed [[COPY11]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_]].sub1 + ; GFX90A-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_]].sub0 + ; GFX90A-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub1 + ; GFX90A-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub0 + ; GFX90A-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY11]], %subreg.sub2, killed [[COPY10]], %subreg.sub3 + ; GFX90A-NEXT: [[COPY12:%[0-9]+]]:vreg_128_align2 = COPY [[REG_SEQUENCE2]] + ; GFX90A-NEXT: [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_CMPSWAP_X2_RTN [[COPY5]], killed [[COPY12]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic monotonic (s64) on %ir.ptr, addrspace 1) ; GFX90A-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U64_e64 [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN]], [[PHI1]], implicit $exec + ; GFX90A-NEXT: [[COPY13:%[0-9]+]]:av_64_align2 = COPY [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN]] ; GFX90A-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_64 = SI_IF_BREAK killed [[V_CMP_EQ_U64_e64_]], [[PHI]], implicit-def dead $scc ; GFX90A-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX90A-NEXT: S_BRANCH %bb.2 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.2.atomicrmw.end: - ; GFX90A-NEXT: [[PHI2:%[0-9]+]]:vreg_64_align2 = PHI [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN]], %bb.1 + ; GFX90A-NEXT: [[PHI2:%[0-9]+]]:av_64_align2 = PHI [[COPY13]], %bb.1 ; GFX90A-NEXT: [[PHI3:%[0-9]+]]:sreg_64 = PHI [[SI_IF_BREAK]], %bb.1 ; GFX90A-NEXT: SI_END_CF [[PHI3]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec - ; GFX90A-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub0 - ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY12]], implicit $exec - ; GFX90A-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub1 - ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY13]], implicit $exec + ; GFX90A-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub0 + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY14]], implicit $exec + ; GFX90A-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub1 + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY15]], implicit $exec ; GFX90A-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX90A-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GFX90A-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll index a50791e10f5a2..1978e68fdae9c 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll @@ -90,18 +90,18 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB0_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -287,18 +287,18 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB1_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -486,18 +486,18 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB2_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1283,12 +1283,11 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX90A-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc @@ -1297,6 +1296,7 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB6_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1697,18 +1697,18 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote(ptr addrspace(1) %pt ; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1912,18 +1912,18 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denor ; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denormal_mode: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2319,18 +2319,18 @@ define float @global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory(p ; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2719,18 +2719,18 @@ define float @global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode(ptr a ; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3712,18 +3712,18 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB18_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4130,18 +4130,18 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ ; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB20_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4532,18 +4532,18 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ ; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB22_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6724,18 +6724,18 @@ define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory(ptr ; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB34_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll index 311faac1b7c29..dc995fb7ef79c 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll @@ -35,24 +35,24 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB0_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: @@ -80,23 +80,23 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB0_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: @@ -197,24 +197,24 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB1_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -242,23 +242,23 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB1_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -361,24 +361,24 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB2_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -406,23 +406,23 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB2_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -1000,24 +1000,24 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB6_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1045,25 +1045,25 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB6_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1328,24 +1328,24 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB8_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: @@ -1407,23 +1407,23 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: @@ -1559,24 +1559,24 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB9_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -1604,23 +1604,23 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -1725,24 +1725,24 @@ define float @global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB10_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory: @@ -1770,23 +1770,23 @@ define float @global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory: @@ -1887,24 +1887,24 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB11_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -1932,23 +1932,23 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2051,24 +2051,24 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB12_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: @@ -2096,23 +2096,23 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: @@ -2690,24 +2690,24 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB16_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2735,25 +2735,25 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -4122,24 +4122,24 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX90A-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX90A-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] -; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc +; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB24_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll index e2808ee9bf706..f62e13a9d4341 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll @@ -35,24 +35,24 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB0_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: @@ -80,23 +80,23 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB0_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: @@ -197,24 +197,24 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB1_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -242,23 +242,23 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB1_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -361,24 +361,24 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB2_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -406,23 +406,23 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB2_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -1000,24 +1000,24 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB6_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1045,25 +1045,25 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB6_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1328,24 +1328,24 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB8_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: @@ -1407,23 +1407,23 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: @@ -1559,24 +1559,24 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB9_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -1604,23 +1604,23 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -1725,24 +1725,24 @@ define float @global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB10_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: @@ -1770,23 +1770,23 @@ define float @global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: @@ -1887,24 +1887,24 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB11_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -1932,23 +1932,23 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2051,24 +2051,24 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB12_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: @@ -2096,23 +2096,23 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: @@ -2690,24 +2690,24 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB16_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2735,25 +2735,25 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -4122,24 +4122,24 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX90A-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX90A-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] -; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc +; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX90A-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB24_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll index 11f0f38d2b6fa..9e6f0fd7f13b5 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll @@ -50,12 +50,11 @@ define float @global_agent_atomic_fsub_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX942-LABEL: global_agent_atomic_fsub_ret_f32: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off +; GFX942-NEXT: global_load_dword v5, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 @@ -63,6 +62,7 @@ define float @global_agent_atomic_fsub_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB0_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -123,18 +123,18 @@ define float @global_agent_atomic_fsub_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX90A-LABEL: global_agent_atomic_fsub_ret_f32: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB0_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -282,12 +282,11 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX942-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_pos: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 @@ -295,6 +294,7 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB1_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -355,18 +355,18 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX90A-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB1_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -516,12 +516,11 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX942-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_neg: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 @@ -529,6 +528,7 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB2_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -589,18 +589,18 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX90A-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_neg: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB2_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1438,12 +1438,11 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX942-LABEL: global_system_atomic_fsub_ret_f32__offset12b_pos: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 @@ -1451,6 +1450,7 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX942-NEXT: buffer_inv sc0 sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB6_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1511,12 +1511,11 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX90A-LABEL: global_system_atomic_fsub_ret_f32__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc @@ -1525,6 +1524,7 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB6_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1905,12 +1905,11 @@ define float @global_agent_atomic_fsub_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX942-LABEL: global_agent_atomic_fsub_ret_f32__ftz: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off +; GFX942-NEXT: global_load_dword v5, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 @@ -1918,6 +1917,7 @@ define float @global_agent_atomic_fsub_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB8_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1978,18 +1978,18 @@ define float @global_agent_atomic_fsub_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX90A-LABEL: global_agent_atomic_fsub_ret_f32__ftz: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2137,12 +2137,11 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX942-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_pos__ftz: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 @@ -2150,6 +2149,7 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB9_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2210,18 +2210,18 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX90A-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_pos__ftz: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2371,12 +2371,11 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX942-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_neg__ftz: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 @@ -2384,6 +2383,7 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB10_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2444,18 +2444,18 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX90A-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_neg__ftz: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3293,12 +3293,11 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspac ; GFX942-LABEL: global_system_atomic_fsub_ret_f32__offset12b_pos__ftz: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 @@ -3306,6 +3305,7 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspac ; GFX942-NEXT: buffer_inv sc0 sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB14_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3366,12 +3366,11 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspac ; GFX90A-LABEL: global_system_atomic_fsub_ret_f32__offset12b_pos__ftz: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc @@ -3380,6 +3379,7 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspac ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3760,12 +3760,11 @@ define double @global_agent_atomic_fsub_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX942-LABEL: global_agent_atomic_fsub_ret_f64: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX942-NEXT: global_load_dwordx2 v[6:7], v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[4:5] ; GFX942-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off sc0 @@ -3773,6 +3772,7 @@ define double @global_agent_atomic_fsub_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[4:5] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB16_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3836,18 +3836,18 @@ define double @global_agent_atomic_fsub_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX90A-LABEL: global_agent_atomic_fsub_ret_f64: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX90A-NEXT: global_load_dwordx2 v[6:7], v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4012,12 +4012,11 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_pos(ptr addrspace(1) ; GFX942-LABEL: global_agent_atomic_fsub_ret_f64__offset12b_pos: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 +; GFX942-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:2040 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[4:5] ; GFX942-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 sc0 @@ -4025,6 +4024,7 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_pos(ptr addrspace(1) ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[4:5] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB17_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4088,18 +4088,18 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_pos(ptr addrspace(1) ; GFX90A-LABEL: global_agent_atomic_fsub_ret_f64__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 +; GFX90A-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:2040 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4265,12 +4265,11 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_neg(ptr addrspace(1) ; GFX942-LABEL: global_agent_atomic_fsub_ret_f64__offset12b_neg: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 +; GFX942-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[4:5] ; GFX942-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 sc0 @@ -4278,6 +4277,7 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_neg(ptr addrspace(1) ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[4:5] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB18_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4341,18 +4341,18 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_neg(ptr addrspace(1) ; GFX90A-LABEL: global_agent_atomic_fsub_ret_f64__offset12b_neg: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB18_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/hazard-recognizer-src-shared-base.ll b/llvm/test/CodeGen/AMDGPU/hazard-recognizer-src-shared-base.ll index 4aa49f2c9296d..1db476300c261 100644 --- a/llvm/test/CodeGen/AMDGPU/hazard-recognizer-src-shared-base.ll +++ b/llvm/test/CodeGen/AMDGPU/hazard-recognizer-src-shared-base.ll @@ -6,9 +6,9 @@ define amdgpu_kernel void @foo() { ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_mov_b64 s[0:1], src_shared_base ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; CHECK-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s1 -; CHECK-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v0 -; CHECK-NEXT: flat_store_b64 v[0:1], v[2:3] +; CHECK-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s1 +; CHECK-NEXT: v_mov_b32_e32 v0, v1 +; CHECK-NEXT: flat_store_b64 v[1:2], v[0:1] ; CHECK-NEXT: s_endpgm entry: br label %bb1 diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll index 10d61deed71cc..76f204dd0c16a 100644 --- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll +++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll @@ -82,9 +82,9 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: s_mov_b32 s70, s68 ; CHECK-NEXT: s_mov_b32 s71, s68 ; CHECK-NEXT: v_writelane_b32 v7, s67, 31 -; CHECK-NEXT: image_sample_lz v3, v[1:2], s[60:67], s[68:71] dmask:0x1 +; CHECK-NEXT: image_sample_lz v1, v[1:2], s[60:67], s[68:71] dmask:0x1 ; CHECK-NEXT: v_readlane_b32 s52, v7, 0 -; CHECK-NEXT: v_mov_b32_e32 v1, v2 +; CHECK-NEXT: v_mov_b32_e32 v3, v2 ; CHECK-NEXT: v_readlane_b32 s53, v7, 1 ; CHECK-NEXT: v_readlane_b32 s54, v7, 2 ; CHECK-NEXT: v_readlane_b32 s55, v7, 3 @@ -97,14 +97,14 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: v_readlane_b32 s60, v7, 8 ; CHECK-NEXT: v_readlane_b32 s61, v7, 9 ; CHECK-NEXT: v_readlane_b32 s62, v7, 10 -; CHECK-NEXT: image_sample_lz v4, v[1:2], s[52:59], s[68:71] dmask:0x1 +; CHECK-NEXT: image_sample_lz v4, v[2:3], s[52:59], s[68:71] dmask:0x1 ; CHECK-NEXT: v_readlane_b32 s63, v7, 11 ; CHECK-NEXT: v_readlane_b32 s64, v7, 12 ; CHECK-NEXT: v_readlane_b32 s65, v7, 13 ; CHECK-NEXT: v_readlane_b32 s66, v7, 14 ; CHECK-NEXT: v_readlane_b32 s67, v7, 15 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_mul_f32_e32 v0, v4, v3 +; CHECK-NEXT: v_mul_f32_e32 v0, v4, v1 ; CHECK-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[6:7] ; CHECK-NEXT: s_cbranch_execz .LBB0_3 @@ -118,13 +118,13 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: v_readlane_b32 s65, v7, 29 ; CHECK-NEXT: v_readlane_b32 s66, v7, 30 ; CHECK-NEXT: v_readlane_b32 s67, v7, 31 +; CHECK-NEXT: v_mov_b32_e32 v1, v2 ; CHECK-NEXT: s_and_b64 vcc, exec, -1 ; CHECK-NEXT: v_readlane_b32 s53, v7, 17 ; CHECK-NEXT: v_readlane_b32 s54, v7, 18 ; CHECK-NEXT: v_readlane_b32 s55, v7, 19 +; CHECK-NEXT: image_sample_lz v3, v[2:3], s[60:67], s[68:71] dmask:0x1 ; CHECK-NEXT: v_readlane_b32 s56, v7, 20 -; CHECK-NEXT: image_sample_lz v3, v[1:2], s[60:67], s[68:71] dmask:0x1 -; CHECK-NEXT: v_mov_b32_e32 v1, v2 ; CHECK-NEXT: v_readlane_b32 s57, v7, 21 ; CHECK-NEXT: v_readlane_b32 s58, v7, 22 ; CHECK-NEXT: v_readlane_b32 s59, v7, 23 diff --git a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll index f705a2ffc4f1d..5e2cec504c6a9 100644 --- a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll +++ b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll @@ -5779,19 +5779,17 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX7-GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v2, 0 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v3, v[1:2] -; GFX7-GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v8 +; GFX7-GISEL-NEXT: v_add_i32_e32 v1, vcc, v0, v8 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v2, v[4:5] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v2, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v2, 0 ; GFX7-GISEL-NEXT: v_addc_u32_e32 v11, vcc, v6, v9, vcc -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v5 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v3, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v3, v[5:6] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, v2, v[7:8] ; GFX7-GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v0 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v6, vcc -; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v3 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc ; GFX7-GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v4 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v1, v[0:1] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v0, v[3:4] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v8, 0 ; GFX7-GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v9, vcc ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v7, v[5:6] @@ -5835,19 +5833,17 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX8-GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v2, 0 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v3, v[1:2] -; GFX8-GISEL-NEXT: v_add_u32_e32 v10, vcc, v0, v8 +; GFX8-GISEL-NEXT: v_add_u32_e32 v1, vcc, v0, v8 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v2, v[4:5] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v2, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v2, 0 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v11, vcc, v6, v9, vcc -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v3, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v3, v[5:6] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, v2, v[7:8] ; GFX8-GISEL-NEXT: v_add_u32_e32 v7, vcc, 1, v0 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v6, vcc -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc ; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, 1, v4 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v1, v[0:1] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v0, v[3:4] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v8, 0 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v9, vcc ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v7, v[5:6] @@ -5887,19 +5883,17 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v1, vcc ; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v2, 0 ; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v3, v[1:2] -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v10, vcc, v0, v8 +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v1, vcc, v0, v8 ; GFX900-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v2, v[4:5] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v2, 0 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v2, 0 ; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v11, vcc, v6, v9, vcc -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v3, v[1:2] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v3, v[5:6] ; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, v2, v[7:8] ; GFX900-GISEL-NEXT: v_add_co_u32_e32 v7, vcc, 1, v0 ; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v6, vcc -; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v6, vcc ; GFX900-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, 1, v4 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v1, v[0:1] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v0, v[3:4] ; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v8, 0 ; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v10, vcc, 0, v9, vcc ; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v7, v[5:6] @@ -5989,28 +5983,26 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX10-GISEL-LABEL: clpeak_imad_pat_i64: ; GFX10-GISEL: ; %bb.0: ; %entry ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, 1 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v6, v2, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v6, v3, v[1:2] -; GFX10-GISEL-NEXT: v_add_co_u32 v8, vcc_lo, v0, v6 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v7, v2, v[4:5] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v8, v2, 0 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v9, null, v4, v7, vcc_lo -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v6 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v8, v3, v[1:2] ; GFX10-GISEL-NEXT: v_add_co_u32 v8, vcc_lo, v0, 1 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v10, null, 0, v4, vcc_lo -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v5, v8, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v9, v2, v[6:7] -; GFX10-GISEL-NEXT: v_add_co_u32 v7, vcc_lo, v5, 1 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, v4 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v3, v7, 0 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v6, vcc_lo -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v5, v10, v[2:3] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v3, v9, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v6, v8, v[4:5] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v4, v7, v[1:2] +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v1, vcc_lo +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v8, v2, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v8, v3, v[1:2] +; GFX10-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v0, v8 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v9, v2, v[4:5] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v1, v2, 0 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v9, null, v6, v9, vcc_lo +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v1, v3, v[5:6] +; GFX10-GISEL-NEXT: v_add_co_u32 v8, vcc_lo, v0, 1 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v0, null, 0, v6, vcc_lo +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v4, v8, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v9, v2, v[7:8] +; GFX10-GISEL-NEXT: v_add_co_u32 v7, vcc_lo, v4, 1 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v4, v0, v[6:7] +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v2, vcc_lo +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v5, v7, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v2, v8, v[3:4] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v5, v4, v[1:2] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v2, v7, v[3:4] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: clpeak_imad_pat_i64: @@ -6049,37 +6041,35 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX11-GISEL-LABEL: clpeak_imad_pat_i64: ; GFX11-GISEL: ; %bb.0: ; %entry ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_add_co_u32 v7, vcc_lo, v0, 1 +; GFX11-GISEL-NEXT: v_add_co_u32 v8, vcc_lo, v0, 1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v8, null, 0, v1, vcc_lo -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v7, v2, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v7, v3, v[1:2] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v8, v2, v[4:5] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v0, v7 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v4, v2, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v10, null, v5, v8, vcc_lo +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v1, vcc_lo +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v8, v2, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v8, v3, v[1:2] +; GFX11-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v0, v8 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v9, v2, v[4:5] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v1, v2, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v10, null, v6, v9, vcc_lo +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v1, v3, v[5:6] ; GFX11-GISEL-NEXT: v_add_co_u32 v11, vcc_lo, v0, 1 -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v12, null, 0, v5, vcc_lo -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, v7 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v4, v3, v[1:2] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v6, v11, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v0, null, 0, v6, vcc_lo +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v4, v11, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v10, v2, v[7:8] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, v4 -; GFX11-GISEL-NEXT: v_add_co_u32 v9, vcc_lo, v6, 1 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v6, v12, v[2:3] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v3, v9, 0 +; GFX11-GISEL-NEXT: v_add_co_u32 v9, vcc_lo, v4, 1 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v4, v0, v[6:7] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v5, v9, 0 ; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v10, null, 0, v8, vcc_lo -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v8, v11, v[4:5] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v3, v10, v[1:2] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v5, v9, v[6:7] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v8, v11, v[2:3] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v5, v10, v[1:2] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v3, v9, v[6:7] ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX1200-SDAG-LABEL: clpeak_imad_pat_i64: @@ -6408,52 +6398,46 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX7-GISEL-LABEL: clpeak_imad_pat_v2i64: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v1, vcc +; GFX7-GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v4, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v5, v[1:2] +; GFX7-GISEL-NEXT: v_add_i32_e32 v1, vcc, v0, v12 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v4, v[8:9] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v4, 0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v15, vcc, v10, v13, vcc ; GFX7-GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v2 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v14, v4, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v1, v5, v[9:10] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v16, v6, 0 ; GFX7-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v3, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v16, v6, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v5, v[1:2] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v7, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v4, v[8:9] -; GFX7-GISEL-NEXT: v_add_i32_e32 v3, vcc, v0, v14 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v17, v6, v[10:11] -; GFX7-GISEL-NEXT: v_addc_u32_e32 v18, vcc, v12, v15, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v3, v4, 0 -; GFX7-GISEL-NEXT: v_add_i32_e32 v16, vcc, v2, v16 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v16, v6, 0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v10 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v5, v[1:2] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v14 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v16, v7, v[1:2] -; GFX7-GISEL-NEXT: v_addc_u32_e32 v19, vcc, v8, v17, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v18, v4, v[10:11] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v19, v6, v[14:15] -; GFX7-GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, 0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v12, vcc -; GFX7-GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v2 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v6, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v11, vcc, 0, v8, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v16, v10, v[1:2] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v6 -; GFX7-GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v9 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v11, v[0:1] -; GFX7-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v16, vcc -; GFX7-GISEL-NEXT: v_add_i32_e32 v17, vcc, 1, v13 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v18, vcc, 0, v3, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v14, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v12, v[8:9] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v17, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v15, v[1:2] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v5, v18, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v14, v[8:9] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v17, v[11:12] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v15, v4, v[11:12] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v7, v[2:3] +; GFX7-GISEL-NEXT: v_add_i32_e32 v9, vcc, v1, v16 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v6, v[3:4] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v6, 0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v12, vcc, v11, v17, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v7, v[3:4] +; GFX7-GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v10, vcc +; GFX7-GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v1 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v12, v6, v[4:5] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v10, 0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v11, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v3, v[5:6] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v9, 0 +; GFX7-GISEL-NEXT: v_add_i32_e32 v15, vcc, 1, v2 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v7, v[6:7] +; GFX7-GISEL-NEXT: v_add_i32_e64 v16, s[4:5], 1, v8 +; GFX7-GISEL-NEXT: v_addc_u32_e64 v6, s[4:5], 0, v13, s[4:5] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v9, v[2:3] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v10, v[0:1] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v16, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v15, 0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v14, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v6, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v4, v17, v[3:4] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v16, v[9:10] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v15, v[11:12] ; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v5 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -6513,52 +6497,46 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX8-GISEL-LABEL: clpeak_imad_pat_v2i64: ; GFX8-GISEL: ; %bb.0: ; %entry ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: v_add_u32_e32 v14, vcc, 1, v0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v1, vcc +; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 1, v0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v4, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v5, v[1:2] +; GFX8-GISEL-NEXT: v_add_u32_e32 v1, vcc, v0, v12 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v4, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v4, 0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v15, vcc, v10, v13, vcc ; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, 1, v2 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v14, v4, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v1, v5, v[9:10] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v16, v6, 0 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v16, v6, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v5, v[1:2] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v7, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v4, v[8:9] -; GFX8-GISEL-NEXT: v_add_u32_e32 v3, vcc, v0, v14 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v17, v6, v[10:11] -; GFX8-GISEL-NEXT: v_addc_u32_e32 v18, vcc, v12, v15, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v3, v4, 0 -; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, v2, v16 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v16, v6, 0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v10 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v5, v[1:2] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v14 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v16, v7, v[1:2] -; GFX8-GISEL-NEXT: v_addc_u32_e32 v19, vcc, v8, v17, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v18, v4, v[10:11] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v19, v6, v[14:15] -; GFX8-GISEL-NEXT: v_add_u32_e32 v10, vcc, 1, v0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, 0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v12, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 1, v2 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v6, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v11, vcc, 0, v8, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v16, v10, v[1:2] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v6 -; GFX8-GISEL-NEXT: v_add_u32_e32 v14, vcc, 1, v9 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v11, v[0:1] -; GFX8-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v16, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v17, vcc, 1, v13 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v18, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v14, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v12, v[8:9] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v17, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v15, v[1:2] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v5, v18, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v14, v[8:9] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v17, v[11:12] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v15, v4, v[11:12] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v7, v[2:3] +; GFX8-GISEL-NEXT: v_add_u32_e32 v9, vcc, v1, v16 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v6, v[3:4] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v6, 0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v12, vcc, v11, v17, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v7, v[3:4] +; GFX8-GISEL-NEXT: v_add_u32_e32 v9, vcc, 1, v0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v10, vcc +; GFX8-GISEL-NEXT: v_add_u32_e32 v10, vcc, 1, v1 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v12, v6, v[4:5] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v10, 0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v11, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v3, v[5:6] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v9, 0 +; GFX8-GISEL-NEXT: v_add_u32_e32 v15, vcc, 1, v2 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v7, v[6:7] +; GFX8-GISEL-NEXT: v_add_u32_e64 v16, s[4:5], 1, v8 +; GFX8-GISEL-NEXT: v_addc_u32_e64 v6, s[4:5], 0, v13, s[4:5] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v9, v[2:3] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v10, v[0:1] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v16, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v15, 0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v14, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v6, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v4, v17, v[3:4] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v16, v[9:10] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v15, v[11:12] ; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -6610,52 +6588,46 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX900-GISEL-LABEL: clpeak_imad_pat_v2i64: ; GFX900-GISEL: ; %bb.0: ; %entry ; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v1, vcc +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 1, v0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v1, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v4, 0 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v5, v[1:2] +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v1, vcc, v0, v12 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v4, v[8:9] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v4, 0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, v10, v13, vcc ; GFX900-GISEL-NEXT: v_add_co_u32_e32 v16, vcc, 1, v2 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v14, v4, 0 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v1, v5, v[9:10] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v16, v6, 0 ; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v3, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v16, v6, 0 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v5, v[1:2] -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v7, v[1:2] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v4, v[8:9] -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v3, vcc, v0, v14 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v17, v6, v[10:11] -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v18, vcc, v12, v15, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v3, v4, 0 -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v16, vcc, v2, v16 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v16, v6, 0 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v10 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v5, v[1:2] -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v14 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v16, v7, v[1:2] -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v19, vcc, v8, v17, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v18, v4, v[10:11] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v19, v6, v[14:15] -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v10, vcc, 1, v0 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, 0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v12, vcc -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 1, v2 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v6, v[0:1] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v8, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v16, v10, v[1:2] -; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v9 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v11, v[0:1] -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v16, vcc -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v17, vcc, 1, v13 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v18, vcc, 0, v3, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v14, 0 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v12, v[8:9] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v17, 0 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v15, v[1:2] -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v5, v18, v[1:2] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v14, v[8:9] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v17, v[11:12] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v15, v4, v[11:12] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v7, v[2:3] +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v9, vcc, v1, v16 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v6, v[3:4] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v6, 0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v12, vcc, v11, v17, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v7, v[3:4] +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v9, vcc, 1, v0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v10, vcc +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v10, vcc, 1, v1 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v12, v6, v[4:5] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v10, 0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v11, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v3, v[5:6] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v9, 0 +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v15, vcc, 1, v2 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v7, v[6:7] +; GFX900-GISEL-NEXT: v_add_co_u32_e64 v16, s[4:5], 1, v8 +; GFX900-GISEL-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, v13, s[4:5] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v9, v[2:3] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v10, v[0:1] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v16, 0 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v15, 0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v14, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v6, v[1:2] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v4, v17, v[3:4] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v16, v[9:10] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v15, v[11:12] ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v5 ; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -6805,50 +6777,46 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX10-GISEL-LABEL: clpeak_imad_pat_v2i64: ; GFX10-GISEL: ; %bb.0: ; %entry ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_add_co_u32 v12, vcc_lo, v0, 1 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v1, vcc_lo -; GFX10-GISEL-NEXT: v_add_co_u32 v14, vcc_lo, v2, 1 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v12, v4, 0 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v3, vcc_lo -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v14, v6, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v12, v5, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v14, v7, v[3:4] -; GFX10-GISEL-NEXT: v_add_co_u32 v3, vcc_lo, v0, v12 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v13, v4, v[8:9] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[11:12], null, v3, v4, 0 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v16, null, v10, v13, vcc_lo -; GFX10-GISEL-NEXT: v_add_co_u32 v17, vcc_lo, v2, v14 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v15, v6, v[9:10] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[13:14], null, v17, v6, 0 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v18, null, v8, v15, vcc_lo -; GFX10-GISEL-NEXT: v_add_co_u32 v19, vcc_lo, v0, 1 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v14 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v20, null, 0, v10, vcc_lo -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v11, v19, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[14:15], null, v3, v5, v[0:1] -; GFX10-GISEL-NEXT: v_add_co_u32 v15, vcc_lo, v2, 1 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v17, v7, v[1:2] -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v12, null, 0, v8, vcc_lo -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v13, v15, 0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v10 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v16, v4, v[14:15] -; GFX10-GISEL-NEXT: v_add_co_u32 v14, vcc_lo, v11, 1 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v18, v6, v[0:1] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v6, v8 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v11, v20, v[1:2] -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v16, null, 0, v4, vcc_lo -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[11:12], null, v13, v12, v[6:7] -; GFX10-GISEL-NEXT: v_add_co_u32 v17, vcc_lo, v13, 1 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v9, v14, 0 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v18, null, 0, v5, vcc_lo -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[12:13], null, v4, v19, v[10:11] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v7, v17, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v5, v15, v[11:12] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v9, v16, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v7, v18, v[3:4] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v12, v14, v[5:6] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v4, v17, v[7:8] +; GFX10-GISEL-NEXT: v_add_co_u32 v11, vcc_lo, v0, 1 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v1, vcc_lo +; GFX10-GISEL-NEXT: v_add_co_u32 v16, vcc_lo, v2, 1 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v11, v4, 0 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v3, vcc_lo +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v16, v6, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v11, v5, v[1:2] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v16, v7, v[3:4] +; GFX10-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v0, v11 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[12:13], null, v15, v4, v[8:9] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[13:14], null, v17, v6, v[9:10] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v1, v4, 0 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, v12, v15, vcc_lo +; GFX10-GISEL-NEXT: v_add_co_u32 v16, vcc_lo, v2, v16 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v18, null, v13, v17, vcc_lo +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v16, v6, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[14:15], null, v1, v5, v[9:10] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[15:16], null, v16, v7, v[11:12] +; GFX10-GISEL-NEXT: v_add_co_u32 v11, vcc_lo, v0, 1 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v0, null, 0, v12, vcc_lo +; GFX10-GISEL-NEXT: v_add_co_u32 v12, vcc_lo, v2, 1 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[16:17], null, v3, v4, v[14:15] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v8, v11, 0 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v13, vcc_lo +; GFX10-GISEL-NEXT: v_add_co_u32 v13, vcc_lo, v8, 1 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[14:15], null, v18, v6, v[15:16] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v10, v12, 0 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v16, vcc_lo +; GFX10-GISEL-NEXT: v_add_co_u32 v17, vcc_lo, v10, 1 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v18, null, 0, v14, vcc_lo +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v8, v0, v[5:6] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v6, v17, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v10, v1, v[7:8] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v4, v13, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v16, v11, v[8:9] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v4, v15, v[1:2] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v14, v12, v[9:10] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v6, v18, v[3:4] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v7, v13, v[4:5] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v8, v17, v[9:10] ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -6911,63 +6879,60 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX11-GISEL-LABEL: clpeak_imad_pat_v2i64: ; GFX11-GISEL: ; %bb.0: ; %entry ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_add_co_u32 v13, vcc_lo, v0, 1 +; GFX11-GISEL-NEXT: v_add_co_u32 v11, vcc_lo, v0, 1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v14, null, 0, v1, vcc_lo -; GFX11-GISEL-NEXT: v_add_co_u32 v15, vcc_lo, v2, 1 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v13, v4, 0 -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v16, null, 0, v3, vcc_lo +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v1, vcc_lo +; GFX11-GISEL-NEXT: v_add_co_u32 v16, vcc_lo, v2, 1 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v11, v4, 0 +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v3, vcc_lo ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v15, v6, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v13, v5, v[1:2] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v16, v6, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v11, v5, v[1:2] ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v15, v7, v[3:4] -; GFX11-GISEL-NEXT: v_add_co_u32 v3, vcc_lo, v0, v13 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v14, v4, v[8:9] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[11:12], null, v16, v6, v[9:10] -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v17, null, v10, v14, vcc_lo -; GFX11-GISEL-NEXT: v_add_co_u32 v18, vcc_lo, v2, v15 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v3, v4, 0 -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v19, null, v11, v16, vcc_lo -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[12:13], null, v18, v6, 0 -; GFX11-GISEL-NEXT: v_add_co_u32 v20, vcc_lo, v0, 1 -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v10, vcc_lo -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, v9 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v8, v20, 0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[13:14], null, v3, v5, v[0:1] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, v10 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[14:15], null, v18, v7, v[1:2] -; GFX11-GISEL-NEXT: v_add_co_u32 v18, vcc_lo, v2, 1 -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v11, vcc_lo +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v16, v7, v[3:4] +; GFX11-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v0, v11 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[12:13], null, v15, v4, v[8:9] ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[15:16], null, v17, v4, v[13:14] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v12, v18, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v19, v6, v[14:15] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v8, v21, v[0:1] -; GFX11-GISEL-NEXT: v_add_co_u32 v14, vcc_lo, v8, 1 -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v16, null, 0, v15, vcc_lo -; GFX11-GISEL-NEXT: v_add_co_u32 v17, vcc_lo, v12, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v12, v22, v[5:6] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v9, v14, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v4, v17, 0 -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v10, vcc_lo -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[11:12], null, v15, v20, v[6:7] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[12:13], null, v10, v18, v[7:8] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v9, v16, v[1:2] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v4, v19, v[3:4] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[13:14], null, v17, v6, v[9:10] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v1, v4, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, v12, v15, vcc_lo +; GFX11-GISEL-NEXT: v_add_co_u32 v18, vcc_lo, v2, v16 +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v19, null, v13, v17, vcc_lo +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v18, v6, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[14:15], null, v1, v5, v[9:10] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[15:16], null, v18, v7, v[11:12] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[16:17], null, v3, v4, v[14:15] +; GFX11-GISEL-NEXT: v_add_co_u32 v14, vcc_lo, v0, 1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v11, v14, v[7:8] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v12, v17, v[8:9] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v0, null, 0, v12, vcc_lo +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v8, v14, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[17:18], null, v19, v6, v[15:16] +; GFX11-GISEL-NEXT: v_add_co_u32 v15, vcc_lo, v2, 1 +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v13, vcc_lo +; GFX11-GISEL-NEXT: v_add_co_u32 v13, vcc_lo, v8, 1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v10, v15, 0 +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v18, null, 0, v16, vcc_lo +; GFX11-GISEL-NEXT: v_add_co_u32 v19, vcc_lo, v10, 1 +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v20, null, 0, v17, vcc_lo +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[11:12], null, v8, v0, v[5:6] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v10, v1, v[7:8] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v4, v13, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v6, v19, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v16, v14, v[11:12] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v17, v15, v[8:9] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v4, v18, v[1:2] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[11:12], null, v6, v20, v[3:4] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v9, v13, v[7:8] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v10, v19, v[11:12] ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, v5 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll b/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll index 69a871f6f6ae5..fa0568d307907 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll @@ -262,12 +262,12 @@ define amdgpu_kernel void @test_flat_misaligned_v2(ptr %arg) { ; ALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; ALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GFX10-NEXT: v_add_co_u32 v0, s0, s0, v0 -; ALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 -; ALIGNED-GFX10-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; ALIGNED-GFX10-NEXT: v_add_co_u32 v3, s0, s0, v0 +; ALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s1, 0, s0 +; ALIGNED-GFX10-NEXT: flat_load_dwordx2 v[0:1], v[3:4] ; ALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v2 -; ALIGNED-GFX10-NEXT: flat_store_dwordx2 v[0:1], v[3:4] +; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v2, v0 +; ALIGNED-GFX10-NEXT: flat_store_dwordx2 v[3:4], v[1:2] ; ALIGNED-GFX10-NEXT: s_endpgm ; ; UNALIGNED-GFX10-LABEL: test_flat_misaligned_v2: @@ -275,12 +275,12 @@ define amdgpu_kernel void @test_flat_misaligned_v2(ptr %arg) { ; UNALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; UNALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; UNALIGNED-GFX10-NEXT: v_add_co_u32 v0, s0, s0, v0 -; UNALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 -; UNALIGNED-GFX10-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; UNALIGNED-GFX10-NEXT: v_add_co_u32 v3, s0, s0, v0 +; UNALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s1, 0, s0 +; UNALIGNED-GFX10-NEXT: flat_load_dwordx2 v[0:1], v[3:4] ; UNALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v2 -; UNALIGNED-GFX10-NEXT: flat_store_dwordx2 v[0:1], v[3:4] +; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v2, v0 +; UNALIGNED-GFX10-NEXT: flat_store_dwordx2 v[3:4], v[1:2] ; UNALIGNED-GFX10-NEXT: s_endpgm ; ; ALIGNED-GFX11-LABEL: test_flat_misaligned_v2: @@ -290,13 +290,13 @@ define amdgpu_kernel void @test_flat_misaligned_v2(ptr %arg) { ; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; ALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GFX11-NEXT: v_add_co_u32 v0, s0, s0, v0 +; ALIGNED-GFX11-NEXT: v_add_co_u32 v3, s0, s0, v0 ; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; ALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 -; ALIGNED-GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] +; ALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, s1, 0, s0 +; ALIGNED-GFX11-NEXT: flat_load_b64 v[0:1], v[3:4] ; ALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; ALIGNED-GFX11-NEXT: v_mov_b32_e32 v4, v2 -; ALIGNED-GFX11-NEXT: flat_store_b64 v[0:1], v[3:4] +; ALIGNED-GFX11-NEXT: v_mov_b32_e32 v2, v0 +; ALIGNED-GFX11-NEXT: flat_store_b64 v[3:4], v[1:2] ; ALIGNED-GFX11-NEXT: s_endpgm ; ; UNALIGNED-GFX11-LABEL: test_flat_misaligned_v2: @@ -306,13 +306,13 @@ define amdgpu_kernel void @test_flat_misaligned_v2(ptr %arg) { ; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; UNALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; UNALIGNED-GFX11-NEXT: v_add_co_u32 v0, s0, s0, v0 +; UNALIGNED-GFX11-NEXT: v_add_co_u32 v3, s0, s0, v0 ; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; UNALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 -; UNALIGNED-GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] +; UNALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, s1, 0, s0 +; UNALIGNED-GFX11-NEXT: flat_load_b64 v[0:1], v[3:4] ; UNALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; UNALIGNED-GFX11-NEXT: v_mov_b32_e32 v4, v2 -; UNALIGNED-GFX11-NEXT: flat_store_b64 v[0:1], v[3:4] +; UNALIGNED-GFX11-NEXT: v_mov_b32_e32 v2, v0 +; UNALIGNED-GFX11-NEXT: flat_store_b64 v[3:4], v[1:2] ; UNALIGNED-GFX11-NEXT: s_endpgm bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -462,13 +462,12 @@ define amdgpu_kernel void @test_flat_misaligned_v3(ptr %arg) { ; ALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; ALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GFX10-NEXT: v_add_co_u32 v5, s0, s0, v0 -; ALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s1, 0, s0 -; ALIGNED-GFX10-NEXT: flat_load_dwordx3 v[0:2], v[5:6] +; ALIGNED-GFX10-NEXT: v_add_co_u32 v4, s0, s0, v0 +; ALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v5, s0, s1, 0, s0 +; ALIGNED-GFX10-NEXT: flat_load_dwordx3 v[1:3], v[4:5] ; ALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v3, v0 -; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v1 -; ALIGNED-GFX10-NEXT: flat_store_dwordx3 v[5:6], v[2:4] +; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v0, v3 +; ALIGNED-GFX10-NEXT: flat_store_dwordx3 v[4:5], v[0:2] ; ALIGNED-GFX10-NEXT: s_endpgm ; ; UNALIGNED-GFX10-LABEL: test_flat_misaligned_v3: @@ -476,13 +475,12 @@ define amdgpu_kernel void @test_flat_misaligned_v3(ptr %arg) { ; UNALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; UNALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; UNALIGNED-GFX10-NEXT: v_add_co_u32 v5, s0, s0, v0 -; UNALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s1, 0, s0 -; UNALIGNED-GFX10-NEXT: flat_load_dwordx3 v[0:2], v[5:6] +; UNALIGNED-GFX10-NEXT: v_add_co_u32 v4, s0, s0, v0 +; UNALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v5, s0, s1, 0, s0 +; UNALIGNED-GFX10-NEXT: flat_load_dwordx3 v[1:3], v[4:5] ; UNALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v3, v0 -; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v1 -; UNALIGNED-GFX10-NEXT: flat_store_dwordx3 v[5:6], v[2:4] +; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v0, v3 +; UNALIGNED-GFX10-NEXT: flat_store_dwordx3 v[4:5], v[0:2] ; UNALIGNED-GFX10-NEXT: s_endpgm ; ; ALIGNED-GFX11-LABEL: test_flat_misaligned_v3: @@ -492,13 +490,13 @@ define amdgpu_kernel void @test_flat_misaligned_v3(ptr %arg) { ; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; ALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GFX11-NEXT: v_add_co_u32 v5, s0, s0, v0 +; ALIGNED-GFX11-NEXT: v_add_co_u32 v4, s0, s0, v0 ; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; ALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s1, 0, s0 -; ALIGNED-GFX11-NEXT: flat_load_b96 v[0:2], v[5:6] +; ALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, s1, 0, s0 +; ALIGNED-GFX11-NEXT: flat_load_b96 v[1:3], v[4:5] ; ALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; ALIGNED-GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 -; ALIGNED-GFX11-NEXT: flat_store_b96 v[5:6], v[2:4] +; ALIGNED-GFX11-NEXT: v_mov_b32_e32 v0, v3 +; ALIGNED-GFX11-NEXT: flat_store_b96 v[4:5], v[0:2] ; ALIGNED-GFX11-NEXT: s_endpgm ; ; UNALIGNED-GFX11-LABEL: test_flat_misaligned_v3: @@ -508,13 +506,13 @@ define amdgpu_kernel void @test_flat_misaligned_v3(ptr %arg) { ; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; UNALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; UNALIGNED-GFX11-NEXT: v_add_co_u32 v5, s0, s0, v0 +; UNALIGNED-GFX11-NEXT: v_add_co_u32 v4, s0, s0, v0 ; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; UNALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s1, 0, s0 -; UNALIGNED-GFX11-NEXT: flat_load_b96 v[0:2], v[5:6] +; UNALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, s1, 0, s0 +; UNALIGNED-GFX11-NEXT: flat_load_b96 v[1:3], v[4:5] ; UNALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; UNALIGNED-GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 -; UNALIGNED-GFX11-NEXT: flat_store_b96 v[5:6], v[2:4] +; UNALIGNED-GFX11-NEXT: v_mov_b32_e32 v0, v3 +; UNALIGNED-GFX11-NEXT: flat_store_b96 v[4:5], v[0:2] ; UNALIGNED-GFX11-NEXT: s_endpgm bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -606,36 +604,33 @@ define amdgpu_kernel void @test_local_aligned_v3(ptr addrspace(3) %arg) { ; SPLIT: ; %bb.0: ; %bb ; SPLIT-NEXT: s_load_dword s0, s[4:5], 0x24 ; SPLIT-NEXT: s_waitcnt lgkmcnt(0) -; SPLIT-NEXT: v_lshl_add_u32 v5, v0, 2, s0 -; SPLIT-NEXT: ds_read_b96 v[0:2], v5 +; SPLIT-NEXT: v_lshl_add_u32 v4, v0, 2, s0 +; SPLIT-NEXT: ds_read_b96 v[1:3], v4 ; SPLIT-NEXT: s_waitcnt lgkmcnt(0) -; SPLIT-NEXT: v_mov_b32_e32 v3, v0 -; SPLIT-NEXT: v_mov_b32_e32 v4, v1 -; SPLIT-NEXT: ds_write_b96 v5, v[2:4] +; SPLIT-NEXT: v_mov_b32_e32 v0, v3 +; SPLIT-NEXT: ds_write_b96 v4, v[0:2] ; SPLIT-NEXT: s_endpgm ; ; ALIGNED-GFX10-LABEL: test_local_aligned_v3: ; ALIGNED-GFX10: ; %bb.0: ; %bb ; ALIGNED-GFX10-NEXT: s_load_dword s0, s[4:5], 0x24 ; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GFX10-NEXT: v_lshl_add_u32 v5, v0, 2, s0 -; ALIGNED-GFX10-NEXT: ds_read_b96 v[0:2], v5 +; ALIGNED-GFX10-NEXT: v_lshl_add_u32 v4, v0, 2, s0 +; ALIGNED-GFX10-NEXT: ds_read_b96 v[1:3], v4 ; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v3, v0 -; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v1 -; ALIGNED-GFX10-NEXT: ds_write_b96 v5, v[2:4] +; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v0, v3 +; ALIGNED-GFX10-NEXT: ds_write_b96 v4, v[0:2] ; ALIGNED-GFX10-NEXT: s_endpgm ; ; UNALIGNED-GFX10-LABEL: test_local_aligned_v3: ; UNALIGNED-GFX10: ; %bb.0: ; %bb ; UNALIGNED-GFX10-NEXT: s_load_dword s0, s[4:5], 0x24 ; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; UNALIGNED-GFX10-NEXT: v_lshl_add_u32 v5, v0, 2, s0 -; UNALIGNED-GFX10-NEXT: ds_read_b96 v[0:2], v5 +; UNALIGNED-GFX10-NEXT: v_lshl_add_u32 v4, v0, 2, s0 +; UNALIGNED-GFX10-NEXT: ds_read_b96 v[1:3], v4 ; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v3, v0 -; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v1 -; UNALIGNED-GFX10-NEXT: ds_write_b96 v5, v[2:4] +; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v0, v3 +; UNALIGNED-GFX10-NEXT: ds_write_b96 v4, v[0:2] ; UNALIGNED-GFX10-NEXT: s_endpgm ; ; ALIGNED-GFX11-LABEL: test_local_aligned_v3: @@ -644,11 +639,11 @@ define amdgpu_kernel void @test_local_aligned_v3(ptr addrspace(3) %arg) { ; ALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; ALIGNED-GFX11-NEXT: v_lshl_add_u32 v5, v0, 2, s0 -; ALIGNED-GFX11-NEXT: ds_load_b96 v[0:2], v5 +; ALIGNED-GFX11-NEXT: v_lshl_add_u32 v4, v0, 2, s0 +; ALIGNED-GFX11-NEXT: ds_load_b96 v[1:3], v4 ; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 -; ALIGNED-GFX11-NEXT: ds_store_b96 v5, v[2:4] +; ALIGNED-GFX11-NEXT: v_mov_b32_e32 v0, v3 +; ALIGNED-GFX11-NEXT: ds_store_b96 v4, v[0:2] ; ALIGNED-GFX11-NEXT: s_endpgm ; ; UNALIGNED-GFX11-LABEL: test_local_aligned_v3: @@ -657,11 +652,11 @@ define amdgpu_kernel void @test_local_aligned_v3(ptr addrspace(3) %arg) { ; UNALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; UNALIGNED-GFX11-NEXT: v_lshl_add_u32 v5, v0, 2, s0 -; UNALIGNED-GFX11-NEXT: ds_load_b96 v[0:2], v5 +; UNALIGNED-GFX11-NEXT: v_lshl_add_u32 v4, v0, 2, s0 +; UNALIGNED-GFX11-NEXT: ds_load_b96 v[1:3], v4 ; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; UNALIGNED-GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 -; UNALIGNED-GFX11-NEXT: ds_store_b96 v5, v[2:4] +; UNALIGNED-GFX11-NEXT: v_mov_b32_e32 v0, v3 +; UNALIGNED-GFX11-NEXT: ds_store_b96 v4, v[0:2] ; UNALIGNED-GFX11-NEXT: s_endpgm bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -683,12 +678,12 @@ define amdgpu_kernel void @test_flat_aligned_v2(ptr %arg) { ; SPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SPLIT-NEXT: s_waitcnt lgkmcnt(0) -; SPLIT-NEXT: v_add_co_u32 v0, s0, s0, v0 -; SPLIT-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 -; SPLIT-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; SPLIT-NEXT: v_add_co_u32 v3, s0, s0, v0 +; SPLIT-NEXT: v_add_co_ci_u32_e64 v4, s0, s1, 0, s0 +; SPLIT-NEXT: flat_load_dwordx2 v[0:1], v[3:4] ; SPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SPLIT-NEXT: v_mov_b32_e32 v4, v2 -; SPLIT-NEXT: flat_store_dwordx2 v[0:1], v[3:4] +; SPLIT-NEXT: v_mov_b32_e32 v2, v0 +; SPLIT-NEXT: flat_store_dwordx2 v[3:4], v[1:2] ; SPLIT-NEXT: s_endpgm ; ; ALIGNED-GFX10-LABEL: test_flat_aligned_v2: @@ -696,12 +691,12 @@ define amdgpu_kernel void @test_flat_aligned_v2(ptr %arg) { ; ALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; ALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GFX10-NEXT: v_add_co_u32 v0, s0, s0, v0 -; ALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 -; ALIGNED-GFX10-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; ALIGNED-GFX10-NEXT: v_add_co_u32 v3, s0, s0, v0 +; ALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s1, 0, s0 +; ALIGNED-GFX10-NEXT: flat_load_dwordx2 v[0:1], v[3:4] ; ALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v2 -; ALIGNED-GFX10-NEXT: flat_store_dwordx2 v[0:1], v[3:4] +; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v2, v0 +; ALIGNED-GFX10-NEXT: flat_store_dwordx2 v[3:4], v[1:2] ; ALIGNED-GFX10-NEXT: s_endpgm ; ; UNALIGNED-GFX10-LABEL: test_flat_aligned_v2: @@ -709,12 +704,12 @@ define amdgpu_kernel void @test_flat_aligned_v2(ptr %arg) { ; UNALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; UNALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; UNALIGNED-GFX10-NEXT: v_add_co_u32 v0, s0, s0, v0 -; UNALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 -; UNALIGNED-GFX10-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; UNALIGNED-GFX10-NEXT: v_add_co_u32 v3, s0, s0, v0 +; UNALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s1, 0, s0 +; UNALIGNED-GFX10-NEXT: flat_load_dwordx2 v[0:1], v[3:4] ; UNALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v2 -; UNALIGNED-GFX10-NEXT: flat_store_dwordx2 v[0:1], v[3:4] +; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v2, v0 +; UNALIGNED-GFX10-NEXT: flat_store_dwordx2 v[3:4], v[1:2] ; UNALIGNED-GFX10-NEXT: s_endpgm ; ; ALIGNED-GFX11-LABEL: test_flat_aligned_v2: @@ -724,13 +719,13 @@ define amdgpu_kernel void @test_flat_aligned_v2(ptr %arg) { ; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; ALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GFX11-NEXT: v_add_co_u32 v0, s0, s0, v0 +; ALIGNED-GFX11-NEXT: v_add_co_u32 v3, s0, s0, v0 ; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; ALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 -; ALIGNED-GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] +; ALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, s1, 0, s0 +; ALIGNED-GFX11-NEXT: flat_load_b64 v[0:1], v[3:4] ; ALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; ALIGNED-GFX11-NEXT: v_mov_b32_e32 v4, v2 -; ALIGNED-GFX11-NEXT: flat_store_b64 v[0:1], v[3:4] +; ALIGNED-GFX11-NEXT: v_mov_b32_e32 v2, v0 +; ALIGNED-GFX11-NEXT: flat_store_b64 v[3:4], v[1:2] ; ALIGNED-GFX11-NEXT: s_endpgm ; ; UNALIGNED-GFX11-LABEL: test_flat_aligned_v2: @@ -740,13 +735,13 @@ define amdgpu_kernel void @test_flat_aligned_v2(ptr %arg) { ; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; UNALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; UNALIGNED-GFX11-NEXT: v_add_co_u32 v0, s0, s0, v0 +; UNALIGNED-GFX11-NEXT: v_add_co_u32 v3, s0, s0, v0 ; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; UNALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 -; UNALIGNED-GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] +; UNALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, s1, 0, s0 +; UNALIGNED-GFX11-NEXT: flat_load_b64 v[0:1], v[3:4] ; UNALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; UNALIGNED-GFX11-NEXT: v_mov_b32_e32 v4, v2 -; UNALIGNED-GFX11-NEXT: flat_store_b64 v[0:1], v[3:4] +; UNALIGNED-GFX11-NEXT: v_mov_b32_e32 v2, v0 +; UNALIGNED-GFX11-NEXT: flat_store_b64 v[3:4], v[1:2] ; UNALIGNED-GFX11-NEXT: s_endpgm bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -942,21 +937,19 @@ define amdgpu_kernel void @test_flat_v4_aligned8(ptr %arg) { ; SPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SPLIT-NEXT: s_waitcnt lgkmcnt(0) -; SPLIT-NEXT: v_add_co_u32 v0, s0, s0, v0 -; SPLIT-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 -; SPLIT-NEXT: v_add_co_u32 v2, vcc_lo, v0, 8 -; SPLIT-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo +; SPLIT-NEXT: v_add_co_u32 v6, s0, s0, v0 +; SPLIT-NEXT: v_add_co_ci_u32_e64 v7, s0, s1, 0, s0 +; SPLIT-NEXT: v_add_co_u32 v8, vcc_lo, v6, 8 +; SPLIT-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v7, vcc_lo ; SPLIT-NEXT: s_clause 0x1 -; SPLIT-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; SPLIT-NEXT: flat_load_dwordx2 v[6:7], v[2:3] +; SPLIT-NEXT: flat_load_dwordx2 v[0:1], v[6:7] +; SPLIT-NEXT: flat_load_dwordx2 v[3:4], v[8:9] ; SPLIT-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; SPLIT-NEXT: v_mov_b32_e32 v8, v5 -; SPLIT-NEXT: v_mov_b32_e32 v9, v4 +; SPLIT-NEXT: v_mov_b32_e32 v2, v0 ; SPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SPLIT-NEXT: v_mov_b32_e32 v4, v7 -; SPLIT-NEXT: v_mov_b32_e32 v5, v6 -; SPLIT-NEXT: flat_store_dwordx2 v[2:3], v[8:9] -; SPLIT-NEXT: flat_store_dwordx2 v[0:1], v[4:5] +; SPLIT-NEXT: v_mov_b32_e32 v5, v3 +; SPLIT-NEXT: flat_store_dwordx2 v[8:9], v[1:2] +; SPLIT-NEXT: flat_store_dwordx2 v[6:7], v[4:5] ; SPLIT-NEXT: s_endpgm ; ; ALIGNED-GFX10-LABEL: test_flat_v4_aligned8: diff --git a/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir b/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir index 02eda2c4822c2..e5b68b48158da 100644 --- a/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir +++ b/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir @@ -5,7 +5,35 @@ # source. # No more registers shall be defined --- -name: main +name: limit_coalesce +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr16, $sgpr17 + + ; CHECK-LABEL: name: limit_coalesce + ; CHECK: liveins: $sgpr16, $sgpr17 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub1:sgpr_64 = COPY $sgpr17 + ; CHECK-NEXT: [[COPY:%[0-9]+]].sub0:sgpr_64 = COPY $sgpr16 + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: INLINEASM &"; def $0", 0 /* attdialect */, 3407882 /* regdef:VReg_64 */, def %4 + ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:vreg_128 = COPY %4.sub1 + ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR [[V_MOV_B32_e32_]], [[COPY1]], [[COPY]], 0, 0, implicit $exec :: (store (s128), addrspace 1) + ; CHECK-NEXT: SI_RETURN + %0:sgpr_32 = COPY killed $sgpr17 + %1:sgpr_32 = COPY killed $sgpr16 + undef %2.sub0:sgpr_64 = COPY killed %1 + %2.sub1:sgpr_64 = COPY killed %0 + %3:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + INLINEASM &"; def $0", 0 /* attdialect */, 3407882 /* regdef:VReg_64 */, def %4:vreg_64 + undef %5.sub0:vreg_128 = COPY killed %4.sub1 + GLOBAL_STORE_DWORDX4_SADDR killed %3, killed %5, killed %2, 0, 0, implicit $exec :: (store (s128), addrspace 1) + SI_RETURN +... + +--- +name: allow_coalesce tracksRegLiveness: true registers: - { id: 0, class: sreg_32_xm0, preferred-register: '%0' } @@ -14,23 +42,18 @@ body: | bb.0: liveins: $sgpr0, $vgpr0_vgpr1 - ; CHECK-LABEL: name: main + ; CHECK-LABEL: name: allow_coalesce ; CHECK: liveins: $sgpr0, $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $sgpr0 - ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64 = COPY [[DEF]].sub0 - ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:vreg_64 = COPY [[COPY]].sub1 - ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:vreg_64 = COPY [[COPY]].sub0 - ; CHECK-NEXT: FLAT_STORE_DWORDX2 $vgpr0_vgpr1, [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vreg_96 = IMPLICIT_DEF - ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:vreg_96 = COPY [[DEF1]] - ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2:vreg_96 = COPY [[DEF]].sub0 - ; CHECK-NEXT: FLAT_STORE_DWORDX3 $vgpr0_vgpr1, [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr - ; CHECK-NEXT: [[DEF2:%[0-9]+]]:vreg_128 = IMPLICIT_DEF - ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1_sub2:vreg_128 = COPY [[DEF2]] - ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub3:vreg_128 = COPY [[DEF]].sub0 - ; CHECK-NEXT: FLAT_STORE_DWORDX4 $vgpr0_vgpr1, [[COPY3]], 0, 0, implicit $exec, implicit $flat_scr + ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub1:vreg_128 = COPY $sgpr0 + ; CHECK-NEXT: FLAT_STORE_DWORDX2 $vgpr0_vgpr1, [[COPY]].sub0_sub1, 0, 0, implicit $exec, implicit $flat_scr + ; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_96 = IMPLICIT_DEF + ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:vreg_96 = COPY [[DEF]] + ; CHECK-NEXT: FLAT_STORE_DWORDX3 $vgpr0_vgpr1, [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vreg_128 = IMPLICIT_DEF + ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1_sub2:vreg_128 = COPY [[DEF1]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub3:vreg_128 = COPY undef [[COPY]].sub2 + ; CHECK-NEXT: FLAT_STORE_DWORDX4 $vgpr0_vgpr1, [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr %2:vreg_64 = IMPLICIT_DEF undef %3.sub0:vreg_64 = COPY $sgpr0 %3.sub1:vreg_64 = COPY %2.sub0 @@ -49,3 +72,4 @@ body: | FLAT_STORE_DWORDX4 $vgpr0_vgpr1, killed %8, 0, 0, implicit $exec, implicit $flat_scr ... + diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll index 8bb7274c84620..76b97e843d777 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll @@ -78,7 +78,6 @@ define amdgpu_kernel void @v_round_f64(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s3, 0xfffff -; SI-NEXT: v_mov_b32_e32 v8, 0x3ff00000 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_bfe_u32 v4, v3, 20, 11 ; SI-NEXT: v_add_i32_e32 v6, vcc, 0xfffffc01, v4 @@ -93,12 +92,12 @@ define amdgpu_kernel void @v_round_f64(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc ; SI-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc ; SI-NEXT: v_add_f64 v[6:7], v[2:3], -v[4:5] -; SI-NEXT: s_brev_b32 s2, -2 +; SI-NEXT: v_mov_b32_e32 v2, 0x3ff00000 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5 -; SI-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc -; SI-NEXT: v_bfi_b32 v3, s2, v2, v3 -; SI-NEXT: v_mov_b32_e32 v2, v1 -; SI-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] +; SI-NEXT: s_brev_b32 s2, -2 +; SI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-NEXT: v_bfi_b32 v2, s2, v2, v3 +; SI-NEXT: v_add_f64 v[2:3], v[4:5], v[1:2] ; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm @@ -113,16 +112,15 @@ define amdgpu_kernel void @v_round_f64(ptr addrspace(1) %out, ptr addrspace(1) % ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b64 s[4:5], s[2:3] ; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 -; CI-NEXT: v_mov_b32_e32 v8, 0x3ff00000 ; CI-NEXT: s_brev_b32 s2, -2 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_trunc_f64_e32 v[4:5], v[2:3] ; CI-NEXT: v_add_f64 v[6:7], v[2:3], -v[4:5] +; CI-NEXT: v_mov_b32_e32 v2, 0x3ff00000 ; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5 -; CI-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc -; CI-NEXT: v_bfi_b32 v3, s2, v2, v3 -; CI-NEXT: v_mov_b32_e32 v2, v1 -; CI-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] +; CI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; CI-NEXT: v_bfi_b32 v2, s2, v2, v3 +; CI-NEXT: v_add_f64 v[2:3], v[4:5], v[1:2] ; CI-NEXT: s_mov_b64 s[2:3], s[6:7] ; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; CI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll index 59f4a9d44bbdd..d23c49165ec70 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll @@ -5985,14 +5985,13 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out, ; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 31, v1 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v6, 16, v1 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v4, v2, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v4, v1, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v3, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v2, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 @@ -6011,23 +6010,22 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out, ; GCN-HSA-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) -; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 +; GCN-HSA-NEXT: v_bfe_i32 v4, v1, 0, 16 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GCN-HSA-NEXT: v_bfe_i32 v4, v3, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v1 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 16, v1 -; GCN-HSA-NEXT: v_bfe_i32 v2, v2, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_sextload_v4i16_to_v4i64: @@ -6044,11 +6042,10 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out, ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, v2 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v4, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v5, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v2, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v6, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v1, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v3, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v5, 31, v4 diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll index 062a985dd7180..b4c0b7497b95f 100644 --- a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll @@ -5734,20 +5734,19 @@ define amdgpu_kernel void @local_sextload_v4i16_to_v4i64(ptr addrspace(3) %out, ; SI-NEXT: v_mov_b32_e32 v0, s1 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_read_b64 v[0:1], v0 -; SI-NEXT: v_mov_b32_e32 v9, s0 +; SI-NEXT: v_mov_b32_e32 v8, s0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v3, v1 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SI-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; SI-NEXT: v_ashrrev_i32_e32 v1, 16, v1 -; SI-NEXT: v_bfe_i32 v3, v3, 0, 16 -; SI-NEXT: v_bfe_i32 v5, v0, 0, 16 -; SI-NEXT: v_bfe_i32 v7, v4, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5 -; SI-NEXT: v_ashrrev_i32_e32 v8, 31, v7 -; SI-NEXT: ds_write2_b64 v9, v[3:4], v[1:2] offset0:2 offset1:3 -; SI-NEXT: ds_write2_b64 v9, v[5:6], v[7:8] offset1:1 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; SI-NEXT: v_ashrrev_i32_e32 v3, 31, v1 +; SI-NEXT: v_ashrrev_i32_e32 v2, 16, v1 +; SI-NEXT: v_bfe_i32 v4, v1, 0, 16 +; SI-NEXT: v_bfe_i32 v0, v0, 0, 16 +; SI-NEXT: v_bfe_i32 v6, v5, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; SI-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; SI-NEXT: ds_write2_b64 v8, v[4:5], v[2:3] offset0:2 offset1:3 +; SI-NEXT: ds_write2_b64 v8, v[0:1], v[6:7] offset1:1 ; SI-NEXT: s_endpgm ; ; VI-NO-DS128-LABEL: local_sextload_v4i16_to_v4i64: @@ -5757,20 +5756,20 @@ define amdgpu_kernel void @local_sextload_v4i16_to_v4i64(ptr addrspace(3) %out, ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_mov_b32_e32 v0, s1 ; VI-NO-DS128-NEXT: ds_read_b64 v[0:1], v0 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v8, s0 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; VI-NO-DS128-NEXT: v_bfe_i32 v4, v3, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v6, v1, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NO-DS128-NEXT: ds_write2_b64 v8, v[6:7], v[4:5] offset0:2 offset1:3 -; VI-NO-DS128-NEXT: ds_write2_b64 v8, v[0:1], v[2:3] offset1:1 +; VI-NO-DS128-NEXT: v_bfe_i32 v1, v1, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v7, v4, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v3, v0, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v5, v2, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v8, 31, v7 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v0, s0 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[1:2], v[7:8] offset0:2 offset1:3 +; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset1:1 ; VI-NO-DS128-NEXT: s_endpgm ; ; GFX9-NO-DS128-LABEL: local_sextload_v4i16_to_v4i64: @@ -5779,20 +5778,20 @@ define amdgpu_kernel void @local_sextload_v4i16_to_v4i64(ptr addrspace(3) %out, ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NO-DS128-NEXT: ds_read_b64 v[0:1], v0 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v8, s0 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v9, s0 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v3, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v6, v1, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v8, v[6:7], v[4:5] offset0:2 offset1:3 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v8, v[0:1], v[2:3] offset1:1 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v7, v4, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v3, v0, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v5, v2, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v8, 31, v7 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v9, v[1:2], v[7:8] offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v9, v[3:4], v[5:6] offset1:1 ; GFX9-NO-DS128-NEXT: s_endpgm ; ; EG-LABEL: local_sextload_v4i16_to_v4i64: @@ -5846,22 +5845,21 @@ define amdgpu_kernel void @local_sextload_v4i16_to_v4i64(ptr addrspace(3) %out, ; VI-DS128-NEXT: s_mov_b32 m0, -1 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_mov_b32_e32 v0, s1 -; VI-DS128-NEXT: ds_read_b64 v[0:1], v0 +; VI-DS128-NEXT: ds_read_b64 v[1:2], v0 ; VI-DS128-NEXT: v_mov_b32_e32 v8, s0 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-DS128-NEXT: v_mov_b32_e32 v3, v1 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; VI-DS128-NEXT: v_bfe_i32 v4, v3, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v6, v1, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v0, v2, 0, 16 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v4, v1, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v6, v3, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-DS128-NEXT: ds_write_b128 v8, v[4:7] offset:16 -; VI-DS128-NEXT: ds_write_b128 v8, v[0:3] +; VI-DS128-NEXT: ds_write_b128 v8, v[0:3] offset:16 +; VI-DS128-NEXT: ds_write_b128 v8, v[4:7] ; VI-DS128-NEXT: s_endpgm ; ; GFX9-DS128-LABEL: local_sextload_v4i16_to_v4i64: @@ -5869,22 +5867,21 @@ define amdgpu_kernel void @local_sextload_v4i16_to_v4i64(ptr addrspace(3) %out, ; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-DS128-NEXT: ds_read_b64 v[0:1], v0 +; GFX9-DS128-NEXT: ds_read_b64 v[1:2], v0 ; GFX9-DS128-NEXT: v_mov_b32_e32 v8, s0 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DS128-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-DS128-NEXT: v_bfe_i32 v4, v3, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v6, v1, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v0, v2, 0, 16 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX9-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; GFX9-DS128-NEXT: v_bfe_i32 v4, v1, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v6, v3, 0, 16 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX9-DS128-NEXT: ds_write_b128 v8, v[4:7] offset:16 -; GFX9-DS128-NEXT: ds_write_b128 v8, v[0:3] +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; GFX9-DS128-NEXT: ds_write_b128 v8, v[0:3] offset:16 +; GFX9-DS128-NEXT: ds_write_b128 v8, v[4:7] ; GFX9-DS128-NEXT: s_endpgm %load = load <4 x i16>, ptr addrspace(3) %in %ext = sext <4 x i16> %load to <4 x i64> diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll index 929bb61ddabcf..bfce9bcac22c7 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll @@ -50,17 +50,17 @@ define float @local_atomic_fsub_ret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX942-LABEL: local_atomic_fsub_ret_f32: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ds_read_b32 v1, v0 +; GFX942-NEXT: ds_read_b32 v2, v0 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, v1 ; GFX942-NEXT: v_add_f32_e32 v1, -4.0, v2 ; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB0_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -119,17 +119,17 @@ define float @local_atomic_fsub_ret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX90A-LABEL: local_atomic_fsub_ret_f32: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v1, v0 +; GFX90A-NEXT: ds_read_b32 v2, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 ; GFX90A-NEXT: v_add_f32_e32 v1, -4.0, v2 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB0_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -262,17 +262,17 @@ define float @local_atomic_fsub_ret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX942-LABEL: local_atomic_fsub_ret_f32__offset: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ds_read_b32 v1, v0 offset:65532 +; GFX942-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, v1 ; GFX942-NEXT: v_add_f32_e32 v1, -4.0, v2 ; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB1_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -331,17 +331,17 @@ define float @local_atomic_fsub_ret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX90A-LABEL: local_atomic_fsub_ret_f32__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v1, v0 offset:65532 +; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 ; GFX90A-NEXT: v_add_f32_e32 v1, -4.0, v2 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB1_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9339,17 +9339,17 @@ define float @local_atomic_fsub_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspa ; GFX942-LABEL: local_atomic_fsub_ret_f32__amdgpu_ignore_denormal_mode: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ds_read_b32 v1, v0 +; GFX942-NEXT: ds_read_b32 v2, v0 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, v1 ; GFX942-NEXT: v_add_f32_e32 v1, -4.0, v2 ; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB28_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9408,17 +9408,17 @@ define float @local_atomic_fsub_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspa ; GFX90A-LABEL: local_atomic_fsub_ret_f32__amdgpu_ignore_denormal_mode: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v1, v0 +; GFX90A-NEXT: ds_read_b32 v2, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 ; GFX90A-NEXT: v_add_f32_e32 v1, -4.0, v2 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB28_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll index 08ec0c847e941..87d52684e588c 100644 --- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll +++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll @@ -632,12 +632,12 @@ define i64 @mad_i64_i32_extops_i32_i64(i32 %arg0, i32 %arg1, i64 %arg2) #0 { ; GFX1100: ; %bb.0: ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-NEXT: v_mad_u64_u32 v[0:1], null, v5, v4, v[2:3] ; GFX1100-NEXT: v_ashrrev_i32_e32 v5, 31, v5 -; GFX1100-NEXT: v_mov_b32_e32 v3, v1 +; GFX1100-NEXT: v_mad_u64_u32 v[2:3], null, v5, v4, v[1:2] ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_mad_u64_u32 v[1:2], null, v5, v4, v[3:4] +; GFX1100-NEXT: v_mov_b32_e32 v1, v2 ; GFX1100-NEXT: s_setpc_b64 s[30:31] ; ; GFX1150-LABEL: mad_i64_i32_extops_i32_i64: @@ -775,13 +775,13 @@ define i64 @mad_u64_u32_bitops_lhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) # ; GFX1100-LABEL: mad_u64_u32_bitops_lhs_mask_small: ; GFX1100: ; %bb.0: ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v0 -; GFX1100-NEXT: v_mov_b32_e32 v6, v1 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_mad_u64_u32 v[0:1], null, v2, v3, v[4:5] -; GFX1100-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_and_b32 v5, 1, v6 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_mad_u64_u32 v[1:2], null, v5, v3, v[4:5] +; GFX1100-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v6, v1 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1100-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, v[4:5] +; GFX1100-NEXT: v_and_b32_e32 v5, 1, v6 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-NEXT: v_mad_u64_u32 v[3:4], null, v5, v2, v[1:2] +; GFX1100-NEXT: v_mov_b32_e32 v1, v3 ; GFX1100-NEXT: s_setpc_b64 s[30:31] ; ; GFX1150-LABEL: mad_u64_u32_bitops_lhs_mask_small: @@ -863,11 +863,12 @@ define i64 @mad_u64_u32_bitops_rhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) # ; GFX1100: ; %bb.0: ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-NEXT: v_mov_b32_e32 v6, v0 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-NEXT: v_mad_u64_u32 v[0:1], null, v6, v2, v[4:5] -; GFX1100-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v4, 1, v3 +; GFX1100-NEXT: v_and_b32_e32 v4, 1, v3 +; GFX1100-NEXT: v_mad_u64_u32 v[2:3], null, v6, v4, v[1:2] ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_mad_u64_u32 v[1:2], null, v6, v4, v[3:4] +; GFX1100-NEXT: v_mov_b32_e32 v1, v2 ; GFX1100-NEXT: s_setpc_b64 s[30:31] ; ; GFX1150-LABEL: mad_u64_u32_bitops_rhs_mask_small: @@ -1807,10 +1808,9 @@ define i64 @lshr_mad_i64_4(i32 %arg0, i64 %arg1) #0 { ; GFX1100: ; %bb.0: ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-NEXT: v_mad_u64_u32 v[3:4], null, v1, v0, 0 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v4, 0 -; GFX1100-NEXT: v_mad_u64_u32 v[5:6], null, v2, v0, v[1:2] -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1100-NEXT: v_mad_u64_u32 v[5:6], null, v2, v0, v[4:5] +; GFX1100-NEXT: v_mov_b32_e32 v4, 0 ; GFX1100-NEXT: v_mad_u64_u32 v[0:1], null, 0xfffffc88, v5, v[3:4] ; GFX1100-NEXT: s_setpc_b64 s[30:31] ; @@ -1818,10 +1818,9 @@ define i64 @lshr_mad_i64_4(i32 %arg0, i64 %arg1) #0 { ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1150-NEXT: v_mad_u64_u32 v[3:4], null, v1, v0, 0 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v4, 0 -; GFX1150-NEXT: v_mad_u64_u32 v[0:1], null, v2, v0, v[1:2] -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_mad_u64_u32 v[0:1], null, v2, v0, v[4:5] +; GFX1150-NEXT: v_mov_b32_e32 v4, 0 ; GFX1150-NEXT: v_mad_u64_u32 v[0:1], null, 0xfffffc88, v0, v[3:4] ; GFX1150-NEXT: s_setpc_b64 s[30:31] ; @@ -1833,10 +1832,9 @@ define i64 @lshr_mad_i64_4(i32 %arg0, i64 %arg1) #0 { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], null, v1, v0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v4, 0 -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, v0, v[1:2] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, v0, v[4:5] +; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, 0xfffffc88, v0, v[3:4] ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -2126,23 +2124,21 @@ define i64 @lshr_mad_i64_negative_4(i64 %arg0) #0 { ; GFX1100-LABEL: lshr_mad_i64_negative_4: ; GFX1100: ; %bb.0: ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_mad_u64_u32 v[2:3], null, v1, v0, v[0:1] +; GFX1100-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_mov_b32_e32 v0, v3 -; GFX1100-NEXT: v_mad_u64_u32 v[3:4], null, v1, v1, v[0:1] +; GFX1100-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, v[2:3] +; GFX1100-NEXT: v_mad_u64_u32 v[4:5], null, v3, v3, v[1:2] ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX1100-NEXT: v_mov_b32_e32 v1, v4 ; GFX1100-NEXT: s_setpc_b64 s[30:31] ; ; GFX1150-LABEL: lshr_mad_i64_negative_4: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1150-NEXT: v_mad_u64_u32 v[3:4], null, v1, v0, v[0:1] +; GFX1150-NEXT: v_dual_mov_b32 v2, v1 :: v_dual_mov_b32 v1, v0 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_mov_b32_e32 v0, v4 -; GFX1150-NEXT: v_mad_u64_u32 v[1:2], null, v1, v1, v[0:1] -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX1150-NEXT: v_mov_b32_e32 v0, v3 +; GFX1150-NEXT: v_mad_u64_u32 v[0:1], null, v2, v1, v[1:2] +; GFX1150-NEXT: v_mad_u64_u32 v[1:2], null, v2, v2, v[1:2] ; GFX1150-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: lshr_mad_i64_negative_4: @@ -2152,12 +2148,10 @@ define i64 @lshr_mad_i64_negative_4(i64 %arg0) #0 { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], null, v1, v0, v[0:1] +; GFX12-NEXT: v_dual_mov_b32 v2, v1 :: v_dual_mov_b32 v1, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mov_b32_e32 v0, v4 -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v1, v1, v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, v1, v[1:2] +; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v2, v2, v[1:2] ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: lshr_mad_i64_negative_4: diff --git a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll index e330c72ba0fc4..fe432e9d7594d 100644 --- a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll @@ -1376,47 +1376,46 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float ; ; GFX90A-LABEL: test_mfma_loop_sgpr_init: ; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: s_load_dword s1, s[4:5], 0x2c -; GFX90A-NEXT: s_mov_b32 s0, 16 +; GFX90A-NEXT: s_load_dword s0, s[4:5], 0x2c +; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a31, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 +; GFX90A-NEXT: v_accvgpr_write_b32 a31, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a30, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a29, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a28, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a27, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a26, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a25, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a24, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a23, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a22, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a21, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a20, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a19, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a18, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a17, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a16, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a15, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a14, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a13, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a12, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a11, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a10, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a9, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a8, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a7, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a6, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a5, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a4, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX90A-NEXT: s_mov_b32 s0, 16 ; GFX90A-NEXT: .LBB5_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_nop 1 +; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] ; GFX90A-NEXT: s_add_i32 s0, s0, -1 ; GFX90A-NEXT: s_cmp_lg_u32 s0, 0 @@ -1438,47 +1437,46 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float ; ; GFX942-LABEL: test_mfma_loop_sgpr_init: ; GFX942: ; %bb.0: ; %entry -; GFX942-NEXT: s_load_dword s1, s[4:5], 0x2c -; GFX942-NEXT: s_mov_b32 s0, 16 +; GFX942-NEXT: s_load_dword s0, s[4:5], 0x2c +; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v0, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a31, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a30, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a29, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a28, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a27, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a26, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a25, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a24, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a23, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a22, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a21, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a20, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a19, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a18, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a17, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a16, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a15, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a14, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a13, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a12, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a11, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a10, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a9, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a8, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a7, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a6, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a5, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a4, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a3, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a2, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a1, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 +; GFX942-NEXT: v_accvgpr_write_b32 a31, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a30, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a29, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a28, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a27, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a26, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a25, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a24, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a23, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a22, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a21, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a20, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a19, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a18, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a17, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a16, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a15, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a14, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a13, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a12, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a11, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a10, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a9, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a8, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a7, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a6, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a5, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a4, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a3, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a2, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a1, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX942-NEXT: s_mov_b32 s0, 16 ; GFX942-NEXT: .LBB5_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31] ; GFX942-NEXT: s_add_i32 s0, s0, -1 ; GFX942-NEXT: s_cmp_lg_u32 s0, 0 @@ -1643,13 +1641,13 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa ; ; GFX90A-LABEL: test_mfma_loop_mixed_init: ; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: s_load_dword s1, s[4:5], 0x2c +; GFX90A-NEXT: s_load_dword s0, s[4:5], 0x2c ; GFX90A-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, s0 ; GFX90A-NEXT: v_accvgpr_write_b32 a29, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a28, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a27, 0 @@ -1679,7 +1677,6 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa ; GFX90A-NEXT: v_accvgpr_write_b32 a3, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0 ; GFX90A-NEXT: s_mov_b32 s0, 16 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX90A-NEXT: .LBB6_1: ; %for.cond.preheader @@ -1706,13 +1703,13 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa ; ; GFX942-LABEL: test_mfma_loop_mixed_init: ; GFX942: ; %bb.0: ; %entry -; GFX942-NEXT: s_load_dword s1, s[4:5], 0x2c +; GFX942-NEXT: s_load_dword s0, s[4:5], 0x2c ; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a31, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a30, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NEXT: v_accvgpr_write_b32 a1, s0 ; GFX942-NEXT: v_accvgpr_write_b32 a29, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a28, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a27, 0 @@ -1742,7 +1739,6 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa ; GFX942-NEXT: v_accvgpr_write_b32 a3, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a2, 0 ; GFX942-NEXT: s_mov_b32 s0, 16 -; GFX942-NEXT: v_accvgpr_write_b32 a1, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX942-NEXT: .LBB6_1: ; %for.cond.preheader @@ -2401,7 +2397,6 @@ define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr addrspace(1) %arg) ; GFX90A-NEXT: v_accvgpr_mov_b32 a31, a0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0 -; GFX90A-NEXT: ; kill: def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 killed $exec ; GFX90A-NEXT: .LBB9_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB9_2 Depth 2 @@ -2471,7 +2466,6 @@ define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr addrspace(1) %arg) ; GFX942-NEXT: v_accvgpr_mov_b32 a31, a0 ; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX942-NEXT: v_mov_b32_e32 v1, 2.0 -; GFX942-NEXT: ; kill: def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 killed $exec ; GFX942-NEXT: .LBB9_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Loop Header: Depth=1 ; GFX942-NEXT: ; Child Loop BB9_2 Depth 2 diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll index d29847e40dc8b..4681d589ac217 100644 --- a/llvm/test/CodeGen/AMDGPU/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/mul.ll @@ -2989,34 +2989,33 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a, ; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4c ; VI-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x7c ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; VI-NEXT: v_mov_b32_e32 v5, 0 +; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s10 -; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s12, v0, 0 +; VI-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s12, v0, 0 ; VI-NEXT: s_mul_i32 s4, s12, s11 -; VI-NEXT: v_mov_b32_e32 v6, s12 -; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v3 -; VI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s8, v6, 0 -; VI-NEXT: s_mul_i32 s6, s13, s10 -; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3 -; VI-NEXT: v_mov_b32_e32 v4, v1 -; VI-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s9, v6, v[4:5] -; VI-NEXT: v_mov_b32_e32 v8, s8 -; VI-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s14, v8, v[2:3] -; VI-NEXT: v_mov_b32_e32 v4, v6 -; VI-NEXT: v_mov_b32_e32 v6, s13 -; VI-NEXT: v_mad_u64_u32 v[1:2], s[4:5], s8, v6, v[4:5] +; VI-NEXT: v_mov_b32_e32 v5, s12 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 +; VI-NEXT: s_mul_i32 s4, s13, s10 +; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v0 +; VI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s8, v5, 0 +; VI-NEXT: v_mov_b32_e32 v7, s8 +; VI-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s14, v7, v[3:4] +; VI-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s9, v5, v[1:2] +; VI-NEXT: v_mov_b32_e32 v7, s13 ; VI-NEXT: s_mul_i32 s6, s15, s8 -; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v9 -; VI-NEXT: v_add_u32_e32 v2, vcc, v7, v2 -; VI-NEXT: v_addc_u32_e64 v3, s[4:5], 0, 0, vcc -; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s9, v6, v[2:3] +; VI-NEXT: v_mov_b32_e32 v1, v5 +; VI-NEXT: v_mad_u64_u32 v[1:2], s[4:5], s8, v7, v[1:2] +; VI-NEXT: v_add_u32_e32 v8, vcc, s6, v4 +; VI-NEXT: v_add_u32_e32 v4, vcc, v6, v2 +; VI-NEXT: v_addc_u32_e64 v5, s[4:5], 0, 0, vcc +; VI-NEXT: v_mad_u64_u32 v[4:5], s[4:5], s9, v7, v[4:5] ; VI-NEXT: s_mul_i32 s6, s14, s9 -; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v8 +; VI-NEXT: v_add_u32_e32 v6, vcc, s6, v8 +; VI-NEXT: v_add_u32_e32 v2, vcc, v4, v3 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc +; VI-NEXT: v_addc_u32_e32 v3, vcc, v5, v6, vcc ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -3370,67 +3369,66 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v2, 4, v0 -; VI-NEXT: v_mov_b32_e32 v10, 0 +; VI-NEXT: v_mov_b32_e32 v9, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_add_u32_e32 v12, vcc, s2, v2 -; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, s2, v2 +; VI-NEXT: v_addc_u32_e32 v12, vcc, 0, v3, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; VI-NEXT: flat_load_dwordx4 v[4:7], v[12:13] +; VI-NEXT: flat_load_dwordx4 v[4:7], v[11:12] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mul_lo_u32 v3, v4, v3 -; VI-NEXT: v_mad_u64_u32 v[14:15], s[0:1], v4, v2, 0 -; VI-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v0, v4, 0 +; VI-NEXT: v_mad_u64_u32 v[13:14], s[0:1], v4, v2, 0 ; VI-NEXT: v_mul_lo_u32 v2, v5, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, v15, v3 -; VI-NEXT: v_add_u32_e32 v15, vcc, v3, v2 -; VI-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v1, v4, v[9:10] -; VI-NEXT: v_mad_u64_u32 v[14:15], s[0:1], v6, v0, v[14:15] -; VI-NEXT: v_mov_b32_e32 v9, v2 -; VI-NEXT: v_mul_lo_u32 v2, v7, v0 -; VI-NEXT: v_mad_u64_u32 v[9:10], s[0:1], v0, v5, v[9:10] +; VI-NEXT: v_mul_lo_u32 v10, v7, v0 +; VI-NEXT: v_mad_u64_u32 v[7:8], s[0:1], v0, v4, 0 +; VI-NEXT: v_add_u32_e32 v3, vcc, v14, v3 +; VI-NEXT: v_add_u32_e32 v14, vcc, v3, v2 +; VI-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v1, v4, v[8:9] +; VI-NEXT: v_mad_u64_u32 v[13:14], s[0:1], v6, v0, v[13:14] +; VI-NEXT: v_mov_b32_e32 v8, v2 +; VI-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v0, v5, v[8:9] ; VI-NEXT: v_mul_lo_u32 v4, v6, v1 -; VI-NEXT: v_add_u32_e32 v6, vcc, v2, v15 -; VI-NEXT: v_add_u32_e32 v2, vcc, v3, v10 +; VI-NEXT: v_add_u32_e32 v6, vcc, v10, v14 +; VI-NEXT: v_add_u32_e32 v2, vcc, v3, v9 ; VI-NEXT: v_addc_u32_e64 v3, s[0:1], 0, 0, vcc ; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v5, v[2:3] ; VI-NEXT: v_add_u32_e32 v2, vcc, v4, v6 -; VI-NEXT: v_add_u32_e32 v10, vcc, v0, v14 -; VI-NEXT: v_addc_u32_e32 v11, vcc, v1, v2, vcc -; VI-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; VI-NEXT: v_add_u32_e32 v9, vcc, v0, v13 +; VI-NEXT: v_addc_u32_e32 v10, vcc, v1, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[11:12], v[7:10] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_mul_i128: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX9-NEXT: v_lshlrev_b32_e32 v14, 4, v0 -; GFX9-NEXT: v_mov_b32_e32 v11, 0 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 4, v0 +; GFX9-NEXT: v_mov_b32_e32 v10, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v14, s[0:1] -; GFX9-NEXT: global_load_dwordx4 v[4:7], v14, s[2:3] +; GFX9-NEXT: global_load_dwordx4 v[0:3], v13, s[0:1] +; GFX9-NEXT: global_load_dwordx4 v[4:7], v13, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mul_lo_u32 v10, v5, v2 -; GFX9-NEXT: v_mul_lo_u32 v12, v4, v3 -; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v4, v2, 0 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v0, v4, 0 -; GFX9-NEXT: v_add3_u32 v9, v9, v12, v10 -; GFX9-NEXT: v_mul_lo_u32 v15, v6, v1 -; GFX9-NEXT: v_mov_b32_e32 v10, v3 -; GFX9-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v1, v4, v[10:11] -; GFX9-NEXT: v_mov_b32_e32 v10, v12 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v0, v5, v[10:11] -; GFX9-NEXT: v_mul_lo_u32 v10, v7, v0 -; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v6, v0, v[8:9] -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v13, v4 -; GFX9-NEXT: v_addc_co_u32_e64 v9, s[0:1], 0, 0, vcc -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v5, v[8:9] -; GFX9-NEXT: v_add3_u32 v5, v10, v7, v15 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v0, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v5, vcc -; GFX9-NEXT: global_store_dwordx4 v14, v[2:5], s[2:3] +; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v0, v4, 0 +; GFX9-NEXT: v_mul_lo_u32 v14, v5, v2 +; GFX9-NEXT: v_mul_lo_u32 v15, v4, v3 +; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[0:1], v1, v4, v[9:10] +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v9, v11 +; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[0:1], v0, v5, v[9:10] +; GFX9-NEXT: v_add3_u32 v3, v3, v15, v14 +; GFX9-NEXT: v_mul_lo_u32 v4, v6, v1 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v6, v0, v[2:3] +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v12, v10 +; GFX9-NEXT: v_mul_lo_u32 v14, v7, v0 +; GFX9-NEXT: v_addc_co_u32_e64 v7, s[0:1], 0, 0, vcc +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v5, v[6:7] +; GFX9-NEXT: v_add3_u32 v3, v14, v3, v4 +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v1, v3, vcc +; GFX9-NEXT: global_store_dwordx4 v13, v[8:11], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_mul_i128: @@ -3468,37 +3466,36 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v12, 0 :: v_dual_lshlrev_b32 v17, 4, v0 +; GFX11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_lshlrev_b32 v15, 4, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b128 v[0:3], v17, s[0:1] -; GFX11-NEXT: global_load_b128 v[4:7], v17, s[2:3] +; GFX11-NEXT: global_load_b128 v[0:3], v15, s[0:1] +; GFX11-NEXT: global_load_b128 v[4:7], v15, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mad_u64_u32 v[8:9], null, v0, v4, 0 -; GFX11-NEXT: v_mul_lo_u32 v18, v5, v2 -; GFX11-NEXT: v_mul_lo_u32 v3, v4, v3 -; GFX11-NEXT: v_mad_u64_u32 v[15:16], null, v4, v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mov_b32_e32 v11, v9 -; GFX11-NEXT: v_add3_u32 v16, v16, v3, v18 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mad_u64_u32 v[13:14], null, v1, v4, v[11:12] -; GFX11-NEXT: v_mul_lo_u32 v4, v6, v1 -; GFX11-NEXT: v_mov_b32_e32 v11, v13 +; GFX11-NEXT: v_mul_lo_u32 v16, v5, v2 +; GFX11-NEXT: v_mad_u64_u32 v[13:14], null, v4, v2, 0 +; GFX11-NEXT: v_mul_lo_u32 v17, v6, v1 +; GFX11-NEXT: v_mul_lo_u32 v18, v7, v0 +; GFX11-NEXT: v_mad_u64_u32 v[11:12], null, v1, v4, v[9:10] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v0, v5, v[11:12] -; GFX11-NEXT: v_mul_lo_u32 v12, v7, v0 -; GFX11-NEXT: v_add_co_u32 v2, s0, v14, v10 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, 0, s0 -; GFX11-NEXT: v_mad_u64_u32 v[10:11], null, v6, v0, v[15:16] -; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v1, v5, v[2:3] +; GFX11-NEXT: v_mov_b32_e32 v9, v11 +; GFX11-NEXT: v_mul_lo_u32 v11, v4, v3 +; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v0, v5, v[9:10] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add3_u32 v0, v12, v11, v4 -; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v6, v10 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v14, v14, v11, v16 +; GFX11-NEXT: v_add_co_u32 v3, s0, v12, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, 0, 0, s0 +; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v6, v0, v[13:14] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v1, v5, v[3:4] +; GFX11-NEXT: v_add3_u32 v0, v18, v10, v17 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v6, v9 ; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, v7, v0, vcc_lo -; GFX11-NEXT: global_store_b128 v17, v[8:11], s[2:3] +; GFX11-NEXT: v_mov_b32_e32 v9, v2 +; GFX11-NEXT: global_store_b128 v15, v[8:11], s[2:3] ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_mul_i128: diff --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll index be1788c6ec83f..2462414992e36 100644 --- a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll +++ b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll @@ -6,15 +6,15 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) { ; GFX942-LABEL: matmul_kernel: ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX942-NEXT: v_accvgpr_write_b32 a2, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_mov_b32 s2, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a1, 0 ; GFX942-NEXT: s_mov_b32 s3, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_cmp_lg_u32 s0, 0 ; GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX942-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; GFX942-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 +; GFX942-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; GFX942-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v1 ; GFX942-NEXT: s_branch .LBB0_2 ; GFX942-NEXT: .LBB0_1: ; %bb2 ; GFX942-NEXT: ; in Loop: Header=BB0_2 Depth=1 @@ -22,12 +22,14 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) { ; GFX942-NEXT: s_ashr_i32 s5, s3, 31 ; GFX942-NEXT: s_mov_b32 s3, s2 ; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX942-NEXT: v_accvgpr_mov_b32 a0, a2 +; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX942-NEXT: v_accvgpr_mov_b32 a2, a1 ; GFX942-NEXT: v_accvgpr_mov_b32 a3, a1 ; GFX942-NEXT: s_and_b32 s3, s5, s4 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mfma_f32_16x16x16_f16 a[2:5], v[2:3], v[2:3], a[0:3] +; GFX942-NEXT: s_nop 6 +; GFX942-NEXT: v_accvgpr_read_b32 v0, a2 ; GFX942-NEXT: s_cbranch_execz .LBB0_4 ; GFX942-NEXT: .LBB0_2: ; %bb ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -35,7 +37,7 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) { ; GFX942-NEXT: s_cbranch_vccz .LBB0_1 ; GFX942-NEXT: ; %bb.3: ; GFX942-NEXT: ; implicit-def: $sgpr3 -; GFX942-NEXT: ; implicit-def: $agpr2 +; GFX942-NEXT: ; implicit-def: $vgpr0 ; GFX942-NEXT: .LBB0_4: ; %common.ret ; GFX942-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll index 8abbdad893819..bbc04aa46adc5 100644 --- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll @@ -203,28 +203,27 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_or_b32_e32 v13, v8, v14 ; GFX9-NEXT: .LBB0_6: ; %Flow3 ; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX9-NEXT: v_mul_lo_u32 v18, v13, v5 +; GFX9-NEXT: v_mul_lo_u32 v16, v13, v5 ; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v23, v13, 0 -; GFX9-NEXT: v_mov_b32_e32 v15, 0 -; GFX9-NEXT: v_mul_lo_u32 v9, v11, v4 -; GFX9-NEXT: v_mov_b32_e32 v14, v6 -; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v4, 0 -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v22, v13, v[14:15] -; GFX9-NEXT: v_add3_u32 v8, v8, v18, v9 -; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v23, v[7:8] -; GFX9-NEXT: v_mov_b32_e32 v14, v16 -; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v23, v11, v[14:15] +; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v4, 0 +; GFX9-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v22, v13, v[6:7] +; GFX9-NEXT: v_mul_lo_u32 v15, v11, v4 ; GFX9-NEXT: v_mul_lo_u32 v4, v12, v22 -; GFX9-NEXT: v_mul_lo_u32 v12, v10, v23 -; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v17, v9 -; GFX9-NEXT: v_addc_co_u32_e64 v10, s[4:5], 0, 0, vcc -; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v22, v11, v[9:10] -; GFX9-NEXT: v_add3_u32 v4, v12, v7, v4 -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v9, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v10, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v6, v13 +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v23, v11, v[6:7] +; GFX9-NEXT: v_add3_u32 v9, v9, v16, v15 +; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v23, v[8:9] +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v14, v7 +; GFX9-NEXT: v_addc_co_u32_e64 v13, s[4:5], 0, 0, vcc +; GFX9-NEXT: v_mul_lo_u32 v15, v10, v23 +; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v22, v11, v[12:13] +; GFX9-NEXT: v_add3_u32 v4, v15, v9, v4 +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v10, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v11, v4, vcc ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v5 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v8, vcc -; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v6, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v7, vcc ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v4, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, v0, v20 ; GFX9-NEXT: v_xor_b32_e32 v1, v1, v21 @@ -1590,25 +1589,24 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX9-NEXT: v_mul_lo_u32 v19, v12, v7 ; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v12, 0 -; GFX9-NEXT: v_mov_b32_e32 v17, 0 +; GFX9-NEXT: v_mov_b32_e32 v9, 0 ; GFX9-NEXT: v_mul_lo_u32 v18, v13, v6 -; GFX9-NEXT: v_mov_b32_e32 v16, v8 -; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v6, 0 -; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v5, v12, v[16:17] -; GFX9-NEXT: v_add3_u32 v10, v10, v19, v18 -; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v4, v[9:10] -; GFX9-NEXT: v_mov_b32_e32 v16, v11 -; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v4, v13, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v5, v12, v[8:9] +; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v12, v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v8, v16 +; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v13, v[8:9] +; GFX9-NEXT: v_add3_u32 v11, v11, v19, v18 +; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v4, v[10:11] ; GFX9-NEXT: v_mul_lo_u32 v6, v14, v5 -; GFX9-NEXT: v_mul_lo_u32 v14, v15, v4 -; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v12, v11 -; GFX9-NEXT: v_addc_co_u32_e64 v12, s[4:5], 0, 0, vcc -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v13, v[11:12] -; GFX9-NEXT: v_add3_u32 v6, v14, v9, v6 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v8 +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, v17, v9 +; GFX9-NEXT: v_mul_lo_u32 v12, v15, v4 +; GFX9-NEXT: v_addc_co_u32_e64 v15, s[4:5], 0, 0, vcc +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v13, v[14:15] +; GFX9-NEXT: v_add3_u32 v6, v12, v11, v6 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v10 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v6, vcc ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v7 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v10, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v8, vcc ; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v4, vcc ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v5, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v2f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v2f32.ll index 9b3dc7f531021..287d1dde21403 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v2f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v2f32.ll @@ -58,12 +58,11 @@ define void @v_shuffle_v2f32_v2f32__1_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v2f32__1_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -111,12 +110,11 @@ define void @v_shuffle_v2f32_v2f32__3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v2f32__3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -154,16 +152,14 @@ define void @v_shuffle_v2f32_v2f32__3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v2f32__3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -208,15 +204,14 @@ define void @v_shuffle_v2f32_v2f32__3_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v2f32__3_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -304,12 +299,12 @@ define void @v_shuffle_v2f32_v2f32__3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v2f32__3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -347,12 +342,11 @@ define void @v_shuffle_v2f32_v2f32__u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v2f32__u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -473,12 +467,11 @@ define void @v_shuffle_v2f32_v2f32__2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v2f32__2_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -593,12 +586,12 @@ define void @v_shuffle_v2f32_v2f32__1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v2f32__1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -724,12 +717,11 @@ define void @v_shuffle_v2f32_v2f32__1_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v2f32__1_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -874,11 +866,11 @@ define void @v_shuffle_v2f32_v2f32__1_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v3f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v3f32.ll index 34043cd067b25..d5998e289c09d 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v3f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v3f32.ll @@ -99,36 +99,33 @@ define void @v_shuffle_v2f32_v3f32__2_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v3f32__2_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2f32_v3f32__2_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2f32_v3f32__2_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v3, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -194,36 +191,33 @@ define void @v_shuffle_v2f32_v3f32__5_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v3f32__5_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2f32_v3f32__5_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2f32_v3f32__5_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v3, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -237,49 +231,45 @@ define void @v_shuffle_v2f32_v3f32__5_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v3f32__5_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2f32_v3f32__5_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v7, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2f32_v3f32__5_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx2 v5, v[2:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx2 v7, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -293,46 +283,43 @@ define void @v_shuffle_v2f32_v3f32__5_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v3f32__5_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2f32_v3f32__5_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] -; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2f32_v3f32__5_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] -; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: global_store_dwordx2 v5, v[0:1], s[0:1] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx2 v5, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -346,15 +333,14 @@ define void @v_shuffle_v2f32_v3f32__5_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v3f32__5_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx2 v6, v[1:2], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -362,15 +348,14 @@ define void @v_shuffle_v2f32_v3f32__5_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -378,15 +363,14 @@ define void @v_shuffle_v2f32_v3f32__5_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v5, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -400,13 +384,12 @@ define void @v_shuffle_v2f32_v3f32__5_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v3f32__5_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -445,36 +428,37 @@ define void @v_shuffle_v2f32_v3f32__5_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v3f32__5_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2f32_v3f32__5_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2f32_v3f32__5_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -488,38 +472,37 @@ define void @v_shuffle_v2f32_v3f32__5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v3f32__5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2f32_v3f32__5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2f32_v3f32__5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -533,12 +516,11 @@ define void @v_shuffle_v2f32_v3f32__u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v3f32__u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -659,13 +641,12 @@ define void @v_shuffle_v2f32_v3f32__2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v3f32__2_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -703,12 +684,11 @@ define void @v_shuffle_v2f32_v3f32__3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v3f32__3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -918,36 +898,37 @@ define void @v_shuffle_v2f32_v3f32__2_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v3f32__2_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2f32_v3f32__2_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2f32_v3f32__2_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1178,38 +1159,37 @@ define void @v_shuffle_v2f32_v3f32__2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v3f32__2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2f32_v3f32__2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2f32_v3f32__2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1408,36 +1388,33 @@ define void @v_shuffle_v2f32_v3f32__2_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v3f32__2_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2f32_v3f32__2_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2f32_v3f32__2_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v3, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1654,42 +1631,42 @@ define void @v_shuffle_v2f32_v3f32__2_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx2 v6, v[3:4], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2f32_v3f32__2_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[4:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v7, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2f32_v3f32__2_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[4:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: global_store_dwordx2 v7, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1939,43 +1916,42 @@ define void @v_shuffle_v2f32_v3f32__2_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2f32_v3f32__2_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v7, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2f32_v3f32__2_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v6 +; GFX942-NEXT: global_store_dwordx2 v7, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2175,7 +2151,7 @@ define void @s_shuffle_v2f32_v3f32__2_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2187,7 +2163,7 @@ define void @s_shuffle_v2f32_v3f32__2_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2199,7 +2175,7 @@ define void @s_shuffle_v2f32_v3f32__2_u() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2274,7 +2250,7 @@ define void @s_shuffle_v2f32_v3f32__5_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2286,7 +2262,7 @@ define void @s_shuffle_v2f32_v3f32__5_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2298,7 +2274,7 @@ define void @s_shuffle_v2f32_v3f32__5_u() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2317,11 +2293,11 @@ define void @s_shuffle_v2f32_v3f32__5_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s8 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2333,11 +2309,11 @@ define void @s_shuffle_v2f32_v3f32__5_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:10] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s8 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2347,13 +2323,13 @@ define void @s_shuffle_v2f32_v3f32__5_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:2] +; GFX942-NEXT: ; def s[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s4 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:6] +; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s0 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2372,10 +2348,11 @@ define void @s_shuffle_v2f32_v3f32__5_1() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s9 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2387,10 +2364,11 @@ define void @s_shuffle_v2f32_v3f32__5_1() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:10] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s9 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2400,12 +2378,13 @@ define void @s_shuffle_v2f32_v3f32__5_1() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:10] +; GFX942-NEXT: ; def s[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2424,11 +2403,11 @@ define void @s_shuffle_v2f32_v3f32__5_2() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s10 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2440,11 +2419,11 @@ define void @s_shuffle_v2f32_v3f32__5_2() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:10] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s10 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2454,13 +2433,13 @@ define void @s_shuffle_v2f32_v3f32__5_2() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:2] +; GFX942-NEXT: ; def s[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s6 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:6] +; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2479,8 +2458,8 @@ define void @s_shuffle_v2f32_v3f32__5_3() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s7, s4 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2492,8 +2471,8 @@ define void @s_shuffle_v2f32_v3f32__5_3() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s7, s4 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2505,8 +2484,8 @@ define void @s_shuffle_v2f32_v3f32__5_3() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s0 +; GFX942-NEXT: s_mov_b32 s3, s0 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2519,17 +2498,44 @@ define void @s_shuffle_v2f32_v3f32__5_3() { } define void @s_shuffle_v2f32_v3f32__5_4() { -; GFX9-LABEL: s_shuffle_v2f32_v3f32__5_4: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:10] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:9] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: s_shuffle_v2f32_v3f32__5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v3f32__5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-LABEL: s_shuffle_v2f32_v3f32__5_4: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:2] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s1 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:9] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=s"() %vec1 = call <3 x float> asm "; def $0", "=s"() %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <2 x i32> @@ -2544,8 +2550,8 @@ define void @s_shuffle_v2f32_v3f32__5_5() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2557,8 +2563,8 @@ define void @s_shuffle_v2f32_v3f32__5_5() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2570,8 +2576,8 @@ define void @s_shuffle_v2f32_v3f32__5_5() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s2 +; GFX942-NEXT: s_mov_b32 s3, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2695,8 +2701,8 @@ define void @s_shuffle_v2f32_v3f32__2_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s7, s4 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2708,8 +2714,8 @@ define void @s_shuffle_v2f32_v3f32__2_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s7, s4 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2721,8 +2727,8 @@ define void @s_shuffle_v2f32_v3f32__2_0() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s0 +; GFX942-NEXT: s_mov_b32 s3, s0 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2929,17 +2935,44 @@ define void @s_shuffle_v2f32_v3f32__1_1() { } define void @s_shuffle_v2f32_v3f32__2_1() { -; GFX9-LABEL: s_shuffle_v2f32_v3f32__2_1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:10] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:9] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: s_shuffle_v2f32_v3f32__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v3f32__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-LABEL: s_shuffle_v2f32_v3f32__2_1: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:2] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s1 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:9] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=s"() %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x float> %shuf) @@ -3150,8 +3183,8 @@ define void @s_shuffle_v2f32_v3f32__2_2() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -3163,8 +3196,8 @@ define void @s_shuffle_v2f32_v3f32__2_2() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -3176,8 +3209,8 @@ define void @s_shuffle_v2f32_v3f32__2_2() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s2 +; GFX942-NEXT: s_mov_b32 s3, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -3388,7 +3421,7 @@ define void @s_shuffle_v2f32_v3f32__2_3() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -3400,7 +3433,7 @@ define void @s_shuffle_v2f32_v3f32__2_3() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -3412,7 +3445,7 @@ define void @s_shuffle_v2f32_v3f32__2_3() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -3635,10 +3668,11 @@ define void @s_shuffle_v2f32_v3f32__2_4() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s9 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -3650,10 +3684,11 @@ define void @s_shuffle_v2f32_v3f32__2_4() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:10] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s9 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -3663,12 +3698,13 @@ define void @s_shuffle_v2f32_v3f32__2_4() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:10] +; GFX942-NEXT: ; def s[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -3897,11 +3933,11 @@ define void @s_shuffle_v2f32_v3f32__2_5() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s10 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -3913,11 +3949,11 @@ define void @s_shuffle_v2f32_v3f32__2_5() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:10] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s10 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -3927,13 +3963,13 @@ define void @s_shuffle_v2f32_v3f32__2_5() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:2] +; GFX942-NEXT: ; def s[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s6 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:6] +; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s6 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v2i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v2i32.ll index 676a521757bd8..a86ca0a4a23c6 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v2i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v2i32.ll @@ -58,12 +58,11 @@ define void @v_shuffle_v2i32_v2i32__1_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v2i32__1_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -111,12 +110,11 @@ define void @v_shuffle_v2i32_v2i32__3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v2i32__3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -154,16 +152,14 @@ define void @v_shuffle_v2i32_v2i32__3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v2i32__3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -208,15 +204,14 @@ define void @v_shuffle_v2i32_v2i32__3_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v2i32__3_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -304,12 +299,12 @@ define void @v_shuffle_v2i32_v2i32__3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v2i32__3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -347,12 +342,11 @@ define void @v_shuffle_v2i32_v2i32__u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v2i32__u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -473,12 +467,11 @@ define void @v_shuffle_v2i32_v2i32__2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v2i32__2_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -593,12 +586,12 @@ define void @v_shuffle_v2i32_v2i32__1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v2i32__1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -724,12 +717,11 @@ define void @v_shuffle_v2i32_v2i32__1_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v2i32__1_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -874,11 +866,11 @@ define void @v_shuffle_v2i32_v2i32__1_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v3i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v3i32.ll index f65340470feb1..d46ca61cff64d 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v3i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v3i32.ll @@ -99,36 +99,33 @@ define void @v_shuffle_v2i32_v3i32__2_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v3i32__2_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i32_v3i32__2_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i32_v3i32__2_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v3, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -194,36 +191,33 @@ define void @v_shuffle_v2i32_v3i32__5_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v3i32__5_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i32_v3i32__5_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i32_v3i32__5_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v3, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -237,49 +231,45 @@ define void @v_shuffle_v2i32_v3i32__5_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v3i32__5_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i32_v3i32__5_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v7, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i32_v3i32__5_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx2 v5, v[2:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx2 v7, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -293,46 +283,43 @@ define void @v_shuffle_v2i32_v3i32__5_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v3i32__5_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i32_v3i32__5_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] -; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i32_v3i32__5_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] -; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: global_store_dwordx2 v5, v[0:1], s[0:1] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx2 v5, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -346,15 +333,14 @@ define void @v_shuffle_v2i32_v3i32__5_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v3i32__5_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx2 v6, v[1:2], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -362,15 +348,14 @@ define void @v_shuffle_v2i32_v3i32__5_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -378,15 +363,14 @@ define void @v_shuffle_v2i32_v3i32__5_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v5, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -400,13 +384,12 @@ define void @v_shuffle_v2i32_v3i32__5_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v3i32__5_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -445,36 +428,37 @@ define void @v_shuffle_v2i32_v3i32__5_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v3i32__5_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i32_v3i32__5_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i32_v3i32__5_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -488,38 +472,37 @@ define void @v_shuffle_v2i32_v3i32__5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v3i32__5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i32_v3i32__5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i32_v3i32__5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -533,12 +516,11 @@ define void @v_shuffle_v2i32_v3i32__u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v3i32__u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -659,13 +641,12 @@ define void @v_shuffle_v2i32_v3i32__2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v3i32__2_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -703,12 +684,11 @@ define void @v_shuffle_v2i32_v3i32__3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v3i32__3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -918,36 +898,37 @@ define void @v_shuffle_v2i32_v3i32__2_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v3i32__2_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i32_v3i32__2_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i32_v3i32__2_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1178,38 +1159,37 @@ define void @v_shuffle_v2i32_v3i32__2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v3i32__2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i32_v3i32__2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i32_v3i32__2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1408,36 +1388,33 @@ define void @v_shuffle_v2i32_v3i32__2_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v3i32__2_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i32_v3i32__2_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i32_v3i32__2_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v3, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1654,42 +1631,42 @@ define void @v_shuffle_v2i32_v3i32__2_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx2 v6, v[3:4], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i32_v3i32__2_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[4:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v7, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i32_v3i32__2_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[4:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: global_store_dwordx2 v7, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1939,43 +1916,42 @@ define void @v_shuffle_v2i32_v3i32__2_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i32_v3i32__2_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v7, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i32_v3i32__2_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v6 +; GFX942-NEXT: global_store_dwordx2 v7, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2175,7 +2151,7 @@ define void @s_shuffle_v2i32_v3i32__2_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2187,7 +2163,7 @@ define void @s_shuffle_v2i32_v3i32__2_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2199,7 +2175,7 @@ define void @s_shuffle_v2i32_v3i32__2_u() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2274,7 +2250,7 @@ define void @s_shuffle_v2i32_v3i32__5_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2286,7 +2262,7 @@ define void @s_shuffle_v2i32_v3i32__5_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2298,7 +2274,7 @@ define void @s_shuffle_v2i32_v3i32__5_u() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2317,11 +2293,11 @@ define void @s_shuffle_v2i32_v3i32__5_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s8 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2333,11 +2309,11 @@ define void @s_shuffle_v2i32_v3i32__5_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:10] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s8 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2347,13 +2323,13 @@ define void @s_shuffle_v2i32_v3i32__5_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:2] +; GFX942-NEXT: ; def s[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s4 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:6] +; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s0 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2372,10 +2348,11 @@ define void @s_shuffle_v2i32_v3i32__5_1() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s9 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2387,10 +2364,11 @@ define void @s_shuffle_v2i32_v3i32__5_1() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:10] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s9 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2400,12 +2378,13 @@ define void @s_shuffle_v2i32_v3i32__5_1() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:10] +; GFX942-NEXT: ; def s[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2424,11 +2403,11 @@ define void @s_shuffle_v2i32_v3i32__5_2() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s10 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2440,11 +2419,11 @@ define void @s_shuffle_v2i32_v3i32__5_2() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:10] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s10 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2454,13 +2433,13 @@ define void @s_shuffle_v2i32_v3i32__5_2() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:2] +; GFX942-NEXT: ; def s[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s6 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:6] +; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2479,8 +2458,8 @@ define void @s_shuffle_v2i32_v3i32__5_3() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s7, s4 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2492,8 +2471,8 @@ define void @s_shuffle_v2i32_v3i32__5_3() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s7, s4 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2505,8 +2484,8 @@ define void @s_shuffle_v2i32_v3i32__5_3() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s0 +; GFX942-NEXT: s_mov_b32 s3, s0 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2519,17 +2498,44 @@ define void @s_shuffle_v2i32_v3i32__5_3() { } define void @s_shuffle_v2i32_v3i32__5_4() { -; GFX9-LABEL: s_shuffle_v2i32_v3i32__5_4: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:10] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:9] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: s_shuffle_v2i32_v3i32__5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v3i32__5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-LABEL: s_shuffle_v2i32_v3i32__5_4: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:2] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s1 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:9] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=s"() %vec1 = call <3 x i32> asm "; def $0", "=s"() %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <2 x i32> @@ -2544,8 +2550,8 @@ define void @s_shuffle_v2i32_v3i32__5_5() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2557,8 +2563,8 @@ define void @s_shuffle_v2i32_v3i32__5_5() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2570,8 +2576,8 @@ define void @s_shuffle_v2i32_v3i32__5_5() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s2 +; GFX942-NEXT: s_mov_b32 s3, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2695,8 +2701,8 @@ define void @s_shuffle_v2i32_v3i32__2_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s7, s4 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2708,8 +2714,8 @@ define void @s_shuffle_v2i32_v3i32__2_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s7, s4 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2721,8 +2727,8 @@ define void @s_shuffle_v2i32_v3i32__2_0() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s0 +; GFX942-NEXT: s_mov_b32 s3, s0 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2929,17 +2935,44 @@ define void @s_shuffle_v2i32_v3i32__1_1() { } define void @s_shuffle_v2i32_v3i32__2_1() { -; GFX9-LABEL: s_shuffle_v2i32_v3i32__2_1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:10] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:9] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: s_shuffle_v2i32_v3i32__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v3i32__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-LABEL: s_shuffle_v2i32_v3i32__2_1: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:2] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s1 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:9] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=s"() %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x i32> %shuf) @@ -3150,8 +3183,8 @@ define void @s_shuffle_v2i32_v3i32__2_2() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -3163,8 +3196,8 @@ define void @s_shuffle_v2i32_v3i32__2_2() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -3176,8 +3209,8 @@ define void @s_shuffle_v2i32_v3i32__2_2() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s2 +; GFX942-NEXT: s_mov_b32 s3, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -3388,7 +3421,7 @@ define void @s_shuffle_v2i32_v3i32__2_3() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -3400,7 +3433,7 @@ define void @s_shuffle_v2i32_v3i32__2_3() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -3412,7 +3445,7 @@ define void @s_shuffle_v2i32_v3i32__2_3() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -3635,10 +3668,11 @@ define void @s_shuffle_v2i32_v3i32__2_4() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s9 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -3650,10 +3684,11 @@ define void @s_shuffle_v2i32_v3i32__2_4() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:10] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s9 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -3663,12 +3698,13 @@ define void @s_shuffle_v2i32_v3i32__2_4() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:10] +; GFX942-NEXT: ; def s[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -3897,11 +3933,11 @@ define void @s_shuffle_v2i32_v3i32__2_5() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s10 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -3913,11 +3949,11 @@ define void @s_shuffle_v2i32_v3i32__2_5() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:10] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s10 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -3927,13 +3963,13 @@ define void @s_shuffle_v2i32_v3i32__2_5() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:2] +; GFX942-NEXT: ; def s[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s6 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:6] +; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s6 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v2p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v2p3.ll index 299dfba482953..02fb06ef54d42 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v2p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v2p3.ll @@ -58,12 +58,11 @@ define void @v_shuffle_v2p3_v2p3__1_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v2p3__1_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -111,12 +110,11 @@ define void @v_shuffle_v2p3_v2p3__3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v2p3__3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -154,16 +152,14 @@ define void @v_shuffle_v2p3_v2p3__3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v2p3__3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -208,15 +204,14 @@ define void @v_shuffle_v2p3_v2p3__3_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v2p3__3_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -304,12 +299,12 @@ define void @v_shuffle_v2p3_v2p3__3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v2p3__3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -347,12 +342,11 @@ define void @v_shuffle_v2p3_v2p3__u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v2p3__u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -473,12 +467,11 @@ define void @v_shuffle_v2p3_v2p3__2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v2p3__2_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -593,12 +586,12 @@ define void @v_shuffle_v2p3_v2p3__1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v2p3__1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -724,12 +717,11 @@ define void @v_shuffle_v2p3_v2p3__1_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v2p3__1_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -874,11 +866,11 @@ define void @v_shuffle_v2p3_v2p3__1_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v3p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v3p3.ll index 13e3d94c35446..d0f00f8363aed 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v3p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v3p3.ll @@ -99,36 +99,33 @@ define void @v_shuffle_v2p3_v3p3__2_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__2_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p3_v3p3__2_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p3_v3p3__2_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v3, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -194,36 +191,33 @@ define void @v_shuffle_v2p3_v3p3__5_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__5_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p3_v3p3__5_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p3_v3p3__5_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v3, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -237,49 +231,45 @@ define void @v_shuffle_v2p3_v3p3__5_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__5_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p3_v3p3__5_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v7, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p3_v3p3__5_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx2 v5, v[2:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx2 v7, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -293,46 +283,43 @@ define void @v_shuffle_v2p3_v3p3__5_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__5_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p3_v3p3__5_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] -; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p3_v3p3__5_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] -; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: global_store_dwordx2 v5, v[0:1], s[0:1] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx2 v5, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -346,15 +333,14 @@ define void @v_shuffle_v2p3_v3p3__5_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__5_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx2 v6, v[1:2], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -362,15 +348,14 @@ define void @v_shuffle_v2p3_v3p3__5_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -378,15 +363,14 @@ define void @v_shuffle_v2p3_v3p3__5_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v5, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -400,13 +384,12 @@ define void @v_shuffle_v2p3_v3p3__5_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__5_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -445,36 +428,37 @@ define void @v_shuffle_v2p3_v3p3__5_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__5_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p3_v3p3__5_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p3_v3p3__5_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -488,38 +472,37 @@ define void @v_shuffle_v2p3_v3p3__5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p3_v3p3__5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p3_v3p3__5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -533,12 +516,11 @@ define void @v_shuffle_v2p3_v3p3__u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -659,13 +641,12 @@ define void @v_shuffle_v2p3_v3p3__2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__2_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -703,12 +684,11 @@ define void @v_shuffle_v2p3_v3p3__3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -918,36 +898,37 @@ define void @v_shuffle_v2p3_v3p3__2_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__2_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p3_v3p3__2_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p3_v3p3__2_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1178,38 +1159,37 @@ define void @v_shuffle_v2p3_v3p3__2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p3_v3p3__2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p3_v3p3__2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1408,36 +1388,33 @@ define void @v_shuffle_v2p3_v3p3__2_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__2_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p3_v3p3__2_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p3_v3p3__2_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v3, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1654,42 +1631,42 @@ define void @v_shuffle_v2p3_v3p3__2_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx2 v6, v[3:4], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p3_v3p3__2_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[4:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v7, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p3_v3p3__2_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[4:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: global_store_dwordx2 v7, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1939,43 +1916,42 @@ define void @v_shuffle_v2p3_v3p3__2_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p3_v3p3__2_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v7, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p3_v3p3__2_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v6 +; GFX942-NEXT: global_store_dwordx2 v7, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2175,7 +2151,7 @@ define void @s_shuffle_v2p3_v3p3__2_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2187,7 +2163,7 @@ define void @s_shuffle_v2p3_v3p3__2_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2199,7 +2175,7 @@ define void @s_shuffle_v2p3_v3p3__2_u() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2274,7 +2250,7 @@ define void @s_shuffle_v2p3_v3p3__5_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2286,7 +2262,7 @@ define void @s_shuffle_v2p3_v3p3__5_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2298,7 +2274,7 @@ define void @s_shuffle_v2p3_v3p3__5_u() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2317,11 +2293,11 @@ define void @s_shuffle_v2p3_v3p3__5_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s8 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2333,11 +2309,11 @@ define void @s_shuffle_v2p3_v3p3__5_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:10] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s8 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2347,13 +2323,13 @@ define void @s_shuffle_v2p3_v3p3__5_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:2] +; GFX942-NEXT: ; def s[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s4 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:6] +; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s0 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2372,10 +2348,11 @@ define void @s_shuffle_v2p3_v3p3__5_1() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s9 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2387,10 +2364,11 @@ define void @s_shuffle_v2p3_v3p3__5_1() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:10] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s9 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2400,12 +2378,13 @@ define void @s_shuffle_v2p3_v3p3__5_1() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:10] +; GFX942-NEXT: ; def s[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2424,11 +2403,11 @@ define void @s_shuffle_v2p3_v3p3__5_2() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s10 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2440,11 +2419,11 @@ define void @s_shuffle_v2p3_v3p3__5_2() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:10] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s10 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2454,13 +2433,13 @@ define void @s_shuffle_v2p3_v3p3__5_2() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:2] +; GFX942-NEXT: ; def s[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s6 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:6] +; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2479,8 +2458,8 @@ define void @s_shuffle_v2p3_v3p3__5_3() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s7, s4 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2492,8 +2471,8 @@ define void @s_shuffle_v2p3_v3p3__5_3() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s7, s4 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2505,8 +2484,8 @@ define void @s_shuffle_v2p3_v3p3__5_3() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s0 +; GFX942-NEXT: s_mov_b32 s3, s0 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2519,17 +2498,44 @@ define void @s_shuffle_v2p3_v3p3__5_3() { } define void @s_shuffle_v2p3_v3p3__5_4() { -; GFX9-LABEL: s_shuffle_v2p3_v3p3__5_4: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:10] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:9] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: s_shuffle_v2p3_v3p3__5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v3p3__5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-LABEL: s_shuffle_v2p3_v3p3__5_4: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:2] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s1 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:9] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <2 x i32> @@ -2544,8 +2550,8 @@ define void @s_shuffle_v2p3_v3p3__5_5() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2557,8 +2563,8 @@ define void @s_shuffle_v2p3_v3p3__5_5() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2570,8 +2576,8 @@ define void @s_shuffle_v2p3_v3p3__5_5() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s2 +; GFX942-NEXT: s_mov_b32 s3, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2695,8 +2701,8 @@ define void @s_shuffle_v2p3_v3p3__2_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s7, s4 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2708,8 +2714,8 @@ define void @s_shuffle_v2p3_v3p3__2_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s7, s4 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2721,8 +2727,8 @@ define void @s_shuffle_v2p3_v3p3__2_0() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s0 +; GFX942-NEXT: s_mov_b32 s3, s0 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2929,17 +2935,44 @@ define void @s_shuffle_v2p3_v3p3__1_1() { } define void @s_shuffle_v2p3_v3p3__2_1() { -; GFX9-LABEL: s_shuffle_v2p3_v3p3__2_1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:10] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:9] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: s_shuffle_v2p3_v3p3__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v3p3__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-LABEL: s_shuffle_v2p3_v3p3__2_1: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:2] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s1 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:9] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) @@ -3150,8 +3183,8 @@ define void @s_shuffle_v2p3_v3p3__2_2() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -3163,8 +3196,8 @@ define void @s_shuffle_v2p3_v3p3__2_2() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -3176,8 +3209,8 @@ define void @s_shuffle_v2p3_v3p3__2_2() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s2 +; GFX942-NEXT: s_mov_b32 s3, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -3388,7 +3421,7 @@ define void @s_shuffle_v2p3_v3p3__2_3() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -3400,7 +3433,7 @@ define void @s_shuffle_v2p3_v3p3__2_3() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -3412,7 +3445,7 @@ define void @s_shuffle_v2p3_v3p3__2_3() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -3635,10 +3668,11 @@ define void @s_shuffle_v2p3_v3p3__2_4() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s9 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -3650,10 +3684,11 @@ define void @s_shuffle_v2p3_v3p3__2_4() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:10] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s9 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -3663,12 +3698,13 @@ define void @s_shuffle_v2p3_v3p3__2_4() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:10] +; GFX942-NEXT: ; def s[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -3897,11 +3933,11 @@ define void @s_shuffle_v2p3_v3p3__2_5() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s10 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -3913,11 +3949,11 @@ define void @s_shuffle_v2p3_v3p3__2_5() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:10] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s10 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -3927,13 +3963,13 @@ define void @s_shuffle_v2p3_v3p3__2_5() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:2] +; GFX942-NEXT: ; def s[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s6 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:6] +; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s6 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v2f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v2f32.ll index 430f64164d24f..35cf10f1135c9 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v2f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v2f32.ll @@ -58,12 +58,11 @@ define void @v_shuffle_v3f32_v2f32__1_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v2f32__1_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -111,12 +110,11 @@ define void @v_shuffle_v3f32_v2f32__3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -154,15 +152,14 @@ define void @v_shuffle_v3f32_v2f32__3_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_0_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -208,15 +205,14 @@ define void @v_shuffle_v3f32_v2f32__3_1_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_1_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -263,10 +259,10 @@ define void @v_shuffle_v3f32_v2f32__3_2_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -307,12 +303,12 @@ define void @v_shuffle_v3f32_v2f32__3_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -350,15 +346,15 @@ define void @v_shuffle_v3f32_v2f32__3_3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -403,14 +399,14 @@ define void @v_shuffle_v3f32_v2f32__3_3_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -458,11 +454,11 @@ define void @v_shuffle_v3f32_v2f32__3_3_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -503,13 +499,13 @@ define void @v_shuffle_v3f32_v2f32__3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -561,26 +557,25 @@ define void @v_shuffle_v3f32_v2f32__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v2f32__u_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v2f32__u_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx3 v0, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -638,13 +633,13 @@ define void @v_shuffle_v3f32_v2f32__1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v2f32__1_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -696,26 +691,25 @@ define void @v_shuffle_v3f32_v2f32__2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v2f32__2_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v2f32__2_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx3 v0, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -729,15 +723,14 @@ define void @v_shuffle_v3f32_v2f32__3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -784,15 +777,14 @@ define void @v_shuffle_v3f32_v2f32__3_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -836,16 +828,15 @@ define void @v_shuffle_v3f32_v2f32__3_1_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_1_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -891,16 +882,15 @@ define void @v_shuffle_v3f32_v2f32__3_2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_2_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1033,13 +1023,13 @@ define void @v_shuffle_v3f32_v2f32__1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v2f32__1_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1121,16 +1111,15 @@ define void @v_shuffle_v3f32_v2f32__3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v2 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1176,15 +1165,14 @@ define void @v_shuffle_v3f32_v2f32__3_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_u_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1230,15 +1218,14 @@ define void @v_shuffle_v3f32_v2f32__3_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_0_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:4] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1289,13 +1276,12 @@ define void @v_shuffle_v3f32_v2f32__3_2_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1393,12 +1379,11 @@ define void @v_shuffle_v3f32_v2f32__1_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v2f32__1_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1446,13 +1431,13 @@ define void @v_shuffle_v3f32_v2f32__3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1493,13 +1478,12 @@ define void @v_shuffle_v3f32_v2f32__3_u_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_u_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1539,30 +1523,29 @@ define void @v_shuffle_v3f32_v2f32__3_0_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_0_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] -; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -1570,16 +1553,16 @@ define void @v_shuffle_v3f32_v2f32__3_0_2(ptr addrspace(1) inreg %ptr) { ; GFX942-LABEL: v_shuffle_v3f32_v2f32__3_0_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] -; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v5 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 ; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -1595,14 +1578,14 @@ define void @v_shuffle_v3f32_v2f32__3_1_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1746,15 +1729,14 @@ define void @v_shuffle_v3f32_v2f32__1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1844,12 +1826,12 @@ define void @v_shuffle_v3f32_v2f32__3_u_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_u_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1890,15 +1872,14 @@ define void @v_shuffle_v3f32_v2f32__3_0_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1947,15 +1928,14 @@ define void @v_shuffle_v3f32_v2f32__3_1_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2001,12 +1981,13 @@ define void @v_shuffle_v3f32_v2f32__3_2_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_2_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2370,7 +2351,29 @@ define void @s_shuffle_v3f32_v2f32__3_3_u() { } define void @s_shuffle_v3f32_v2f32__3_3_0() { -; GFX900-LABEL: s_shuffle_v3f32_v2f32__3_3_0: +; GFX9-LABEL: s_shuffle_v3f32_v2f32__3_3_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v2f32__3_3_1() { +; GFX900-LABEL: s_shuffle_v3f32_v2f32__3_3_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART @@ -2380,13 +2383,13 @@ define void @s_shuffle_v3f32_v2f32__3_3_0() { ; GFX900-NEXT: ; def s[4:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s9 -; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s10, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:10] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3f32_v2f32__3_3_0: +; GFX90A-LABEL: s_shuffle_v3f32_v2f32__3_3_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART @@ -2396,13 +2399,13 @@ define void @s_shuffle_v3f32_v2f32__3_3_0() { ; GFX90A-NEXT: ; def s[4:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s9 -; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s10, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3f32_v2f32__3_3_0: +; GFX942-LABEL: s_shuffle_v3f32_v2f32__3_3_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -2412,115 +2415,31 @@ define void @s_shuffle_v3f32_v2f32__3_3_0() { ; GFX942-NEXT: ; def s[0:1] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s9 -; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s10, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:10] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=s"() %vec1 = call <2 x float> asm "; def $0", "=s"() - %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x float> %shuf) - ret void -} - -define void @s_shuffle_v3f32_v2f32__3_3_1() { -; GFX900-LABEL: s_shuffle_v3f32_v2f32__3_3_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s9 -; GFX900-NEXT: s_mov_b32 s10, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3f32_v2f32__3_3_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s9 -; GFX90A-NEXT: s_mov_b32 s10, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3f32_v2f32__3_3_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:9] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s9 -; GFX942-NEXT: s_mov_b32 s10, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <2 x float> asm "; def $0", "=s"() - %vec1 = call <2 x float> asm "; def $0", "=s"() - %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x float> %shuf) ret void } define void @s_shuffle_v3f32_v2f32__3_3_2() { -; GFX900-LABEL: s_shuffle_v3f32_v2f32__3_3_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s5 -; GFX900-NEXT: s_mov_b32 s9, s5 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3f32_v2f32__3_3_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s5 -; GFX90A-NEXT: s_mov_b32 s9, s5 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3f32_v2f32__3_3_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s1 -; GFX942-NEXT: s_mov_b32 s9, s1 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3f32_v2f32__3_3_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s11 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=s"() %vec1 = call <2 x float> asm "; def $0", "=s"() %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> @@ -2549,44 +2468,17 @@ define void @s_shuffle_v3f32_v2f32__3_3_3() { } define void @s_shuffle_v3f32_v2f32__u_0_0() { -; GFX900-LABEL: s_shuffle_v3f32_v2f32__u_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s9, s4 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3f32_v2f32__u_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s9, s4 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3f32_v2f32__u_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s9, s0 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3f32_v2f32__u_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s9, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=s"() %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <3 x i32> call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x float> %shuf) @@ -2613,47 +2505,18 @@ define void @s_shuffle_v3f32_v2f32__0_0_0() { } define void @s_shuffle_v3f32_v2f32__1_0_0() { -; GFX900-LABEL: s_shuffle_v3f32_v2f32__1_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s5 -; GFX900-NEXT: s_mov_b32 s9, s4 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3f32_v2f32__1_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s5 -; GFX90A-NEXT: s_mov_b32 s9, s4 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3f32_v2f32__1_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s1 -; GFX942-NEXT: s_mov_b32 s9, s0 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3f32_v2f32__1_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s11 +; GFX9-NEXT: s_mov_b32 s9, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=s"() %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <3 x i32> call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x float> %shuf) @@ -2661,44 +2524,17 @@ define void @s_shuffle_v3f32_v2f32__1_0_0() { } define void @s_shuffle_v3f32_v2f32__2_0_0() { -; GFX900-LABEL: s_shuffle_v3f32_v2f32__2_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s9, s4 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3f32_v2f32__2_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s9, s4 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3f32_v2f32__2_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s9, s0 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3f32_v2f32__2_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s9, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=s"() %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <3 x i32> call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x float> %shuf) @@ -2710,14 +2546,13 @@ define void @s_shuffle_v3f32_v2f32__3_0_0() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ; def s[10:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ; def s[4:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s7 -; GFX900-NEXT: s_mov_b32 s9, s4 -; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b32 s9, s10 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:10] ; GFX900-NEXT: ;;#ASMEND @@ -2727,14 +2562,13 @@ define void @s_shuffle_v3f32_v2f32__3_0_0() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ; def s[10:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ; def s[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s7 -; GFX90A-NEXT: s_mov_b32 s9, s4 -; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b32 s9, s10 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] ; GFX90A-NEXT: ;;#ASMEND @@ -2744,14 +2578,13 @@ define void @s_shuffle_v3f32_v2f32__3_0_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] +; GFX942-NEXT: ; def s[10:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[2:3] +; GFX942-NEXT: ; def s[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s3 -; GFX942-NEXT: s_mov_b32 s9, s0 -; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s8, s1 +; GFX942-NEXT: s_mov_b32 s9, s10 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:10] ; GFX942-NEXT: ;;#ASMEND @@ -2768,13 +2601,12 @@ define void @s_shuffle_v3f32_v2f32__3_u_0() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ; def s[10:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ; def s[4:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s7 -; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s8, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:10] ; GFX900-NEXT: ;;#ASMEND @@ -2784,13 +2616,12 @@ define void @s_shuffle_v3f32_v2f32__3_u_0() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ; def s[10:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ; def s[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s7 -; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s8, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] ; GFX90A-NEXT: ;;#ASMEND @@ -2800,13 +2631,12 @@ define void @s_shuffle_v3f32_v2f32__3_u_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] +; GFX942-NEXT: ; def s[10:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[2:3] +; GFX942-NEXT: ; def s[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s3 -; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s8, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:10] ; GFX942-NEXT: ;;#ASMEND @@ -2823,14 +2653,13 @@ define void @s_shuffle_v3f32_v2f32__3_1_0() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ; def s[10:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ; def s[4:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s7 -; GFX900-NEXT: s_mov_b32 s9, s5 -; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b32 s9, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:10] ; GFX900-NEXT: ;;#ASMEND @@ -2840,14 +2669,13 @@ define void @s_shuffle_v3f32_v2f32__3_1_0() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ; def s[10:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ; def s[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s7 -; GFX90A-NEXT: s_mov_b32 s9, s5 -; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b32 s9, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] ; GFX90A-NEXT: ;;#ASMEND @@ -2857,14 +2685,13 @@ define void @s_shuffle_v3f32_v2f32__3_1_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] +; GFX942-NEXT: ; def s[10:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[2:3] +; GFX942-NEXT: ; def s[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s3 -; GFX942-NEXT: s_mov_b32 s9, s1 -; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s8, s1 +; GFX942-NEXT: s_mov_b32 s9, s11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:10] ; GFX942-NEXT: ;;#ASMEND @@ -2881,14 +2708,13 @@ define void @s_shuffle_v3f32_v2f32__3_2_0() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ; def s[10:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ; def s[4:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s7 -; GFX900-NEXT: s_mov_b32 s9, s6 -; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b32 s9, s4 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:10] ; GFX900-NEXT: ;;#ASMEND @@ -2898,14 +2724,13 @@ define void @s_shuffle_v3f32_v2f32__3_2_0() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ; def s[10:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ; def s[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s7 -; GFX90A-NEXT: s_mov_b32 s9, s6 -; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b32 s9, s4 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] ; GFX90A-NEXT: ;;#ASMEND @@ -2915,14 +2740,13 @@ define void @s_shuffle_v3f32_v2f32__3_2_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] +; GFX942-NEXT: ; def s[10:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[2:3] +; GFX942-NEXT: ; def s[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s3 -; GFX942-NEXT: s_mov_b32 s9, s2 -; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s8, s1 +; GFX942-NEXT: s_mov_b32 s9, s0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:10] ; GFX942-NEXT: ;;#ASMEND @@ -3344,47 +3168,18 @@ define void @s_shuffle_v3f32_v2f32__2_2_2() { } define void @s_shuffle_v3f32_v2f32__3_2_2() { -; GFX900-LABEL: s_shuffle_v3f32_v2f32__3_2_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s5 -; GFX900-NEXT: s_mov_b32 s9, s4 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3f32_v2f32__3_2_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s5 -; GFX90A-NEXT: s_mov_b32 s9, s4 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3f32_v2f32__3_2_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s1 -; GFX942-NEXT: s_mov_b32 s9, s0 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3f32_v2f32__3_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s11 +; GFX9-NEXT: s_mov_b32 s9, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=s"() %vec1 = call <2 x float> asm "; def $0", "=s"() %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> @@ -3393,44 +3188,17 @@ define void @s_shuffle_v3f32_v2f32__3_2_2() { } define void @s_shuffle_v3f32_v2f32__3_u_2() { -; GFX900-LABEL: s_shuffle_v3f32_v2f32__3_u_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s5 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3f32_v2f32__3_u_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s5 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3f32_v2f32__3_u_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s1 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3f32_v2f32__3_u_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=s"() %vec1 = call <2 x float> asm "; def $0", "=s"() %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> @@ -3446,11 +3214,10 @@ define void @s_shuffle_v3f32_v2f32__3_0_2() { ; GFX900-NEXT: ; def s[4:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ; def s[10:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s8, s11 ; GFX900-NEXT: s_mov_b32 s9, s4 -; GFX900-NEXT: s_mov_b32 s10, s6 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:10] ; GFX900-NEXT: ;;#ASMEND @@ -3463,11 +3230,10 @@ define void @s_shuffle_v3f32_v2f32__3_0_2() { ; GFX90A-NEXT: ; def s[4:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ; def s[10:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s8, s11 ; GFX90A-NEXT: s_mov_b32 s9, s4 -; GFX90A-NEXT: s_mov_b32 s10, s6 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] ; GFX90A-NEXT: ;;#ASMEND @@ -3480,11 +3246,10 @@ define void @s_shuffle_v3f32_v2f32__3_0_2() { ; GFX942-NEXT: ; def s[0:1] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[2:3] +; GFX942-NEXT: ; def s[10:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s3 +; GFX942-NEXT: s_mov_b32 s8, s11 ; GFX942-NEXT: s_mov_b32 s9, s0 -; GFX942-NEXT: s_mov_b32 s10, s2 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:10] ; GFX942-NEXT: ;;#ASMEND @@ -3497,53 +3262,20 @@ define void @s_shuffle_v3f32_v2f32__3_0_2() { } define void @s_shuffle_v3f32_v2f32__3_1_2() { -; GFX900-LABEL: s_shuffle_v3f32_v2f32__3_1_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s5 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3f32_v2f32__3_1_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s5 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3f32_v2f32__3_1_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:9] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s1 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3f32_v2f32__3_1_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=s"() %vec1 = call <2 x float> asm "; def $0", "=s"() %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v3f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v3f32.ll index ef670e963bdb6..befc1126d6fa4 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v3f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v3f32.ll @@ -58,12 +58,11 @@ define void @v_shuffle_v3f32_v3f32__1_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__1_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -153,12 +152,11 @@ define void @v_shuffle_v3f32_v3f32__4_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__4_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -240,15 +238,14 @@ define void @v_shuffle_v3f32_v3f32__5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -355,9 +352,8 @@ define void @v_shuffle_v3f32_v3f32__5_2_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -403,13 +399,12 @@ define void @v_shuffle_v3f32_v3f32__5_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -495,9 +490,8 @@ define void @v_shuffle_v3f32_v3f32__5_5_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 ; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -540,14 +534,12 @@ define void @v_shuffle_v3f32_v3f32__5_5_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -597,16 +589,14 @@ define void @v_shuffle_v3f32_v3f32__5_5_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -656,15 +646,14 @@ define void @v_shuffle_v3f32_v3f32__5_5_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -760,14 +749,13 @@ define void @v_shuffle_v3f32_v3f32__5_5_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_5_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -813,10 +801,10 @@ define void @v_shuffle_v3f32_v3f32__5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -857,12 +845,11 @@ define void @v_shuffle_v3f32_v3f32__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -994,13 +981,12 @@ define void @v_shuffle_v3f32_v3f32__2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1041,12 +1027,11 @@ define void @v_shuffle_v3f32_v3f32__3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1088,14 +1073,12 @@ define void @v_shuffle_v3f32_v3f32__4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1145,16 +1128,15 @@ define void @v_shuffle_v3f32_v3f32__5_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1376,16 +1358,15 @@ define void @v_shuffle_v3f32_v3f32__5_3_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1574,10 +1555,10 @@ define void @v_shuffle_v3f32_v3f32__1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1704,15 +1685,14 @@ define void @v_shuffle_v3f32_v3f32__4_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1816,15 +1796,14 @@ define void @v_shuffle_v3f32_v3f32__5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1872,16 +1851,14 @@ define void @v_shuffle_v3f32_v3f32__5_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1931,16 +1908,15 @@ define void @v_shuffle_v3f32_v3f32__5_2_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1989,16 +1965,15 @@ define void @v_shuffle_v3f32_v3f32__5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2100,12 +2075,12 @@ define void @v_shuffle_v3f32_v3f32__u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__u_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2184,13 +2159,12 @@ define void @v_shuffle_v3f32_v3f32__1_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__1_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2232,10 +2206,10 @@ define void @v_shuffle_v3f32_v3f32__2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2274,12 +2248,12 @@ define void @v_shuffle_v3f32_v3f32__3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__3_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2317,15 +2291,14 @@ define void @v_shuffle_v3f32_v3f32__4_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2374,13 +2347,13 @@ define void @v_shuffle_v3f32_v3f32__5_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2480,16 +2453,15 @@ define void @v_shuffle_v3f32_v3f32__5_0_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2591,15 +2563,15 @@ define void @v_shuffle_v3f32_v3f32__5_3_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2751,12 +2723,11 @@ define void @v_shuffle_v3f32_v3f32__1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__1_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2895,13 +2866,12 @@ define void @v_shuffle_v3f32_v3f32__5_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2988,16 +2958,14 @@ define void @v_shuffle_v3f32_v3f32__5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3103,10 +3071,8 @@ define void @v_shuffle_v3f32_v3f32__5_2_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3306,9 +3272,8 @@ define void @v_shuffle_v3f32_v3f32__1_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v4, v3 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3456,10 +3421,10 @@ define void @v_shuffle_v3f32_v3f32__4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3545,13 +3510,12 @@ define void @v_shuffle_v3f32_v3f32__5_u_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_u_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3592,15 +3556,15 @@ define void @v_shuffle_v3f32_v3f32__5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3705,14 +3669,13 @@ define void @v_shuffle_v3f32_v3f32__5_2_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3760,13 +3723,12 @@ define void @v_shuffle_v3f32_v3f32__5_3_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_3_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3807,12 +3769,12 @@ define void @v_shuffle_v3f32_v3f32__u_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__u_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3913,9 +3875,9 @@ define void @v_shuffle_v3f32_v3f32__1_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 ; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3963,15 +3925,15 @@ define void @v_shuffle_v3f32_v3f32__2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: v_mov_b32_e32 v4, v5 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4060,13 +4022,12 @@ define void @v_shuffle_v3f32_v3f32__4_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__4_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4150,15 +4111,15 @@ define void @v_shuffle_v3f32_v3f32__5_0_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4262,15 +4223,15 @@ define void @v_shuffle_v3f32_v3f32__5_2_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 ; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4316,14 +4277,13 @@ define void @v_shuffle_v3f32_v3f32__5_3_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_3_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v2i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v2i32.ll index ea4fac3b1d2b1..51d45922893b3 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v2i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v2i32.ll @@ -58,12 +58,11 @@ define void @v_shuffle_v3i32_v2i32__1_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v2i32__1_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -111,12 +110,11 @@ define void @v_shuffle_v3i32_v2i32__3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -154,15 +152,14 @@ define void @v_shuffle_v3i32_v2i32__3_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_0_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -208,15 +205,14 @@ define void @v_shuffle_v3i32_v2i32__3_1_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_1_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -263,10 +259,10 @@ define void @v_shuffle_v3i32_v2i32__3_2_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -307,12 +303,12 @@ define void @v_shuffle_v3i32_v2i32__3_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -350,15 +346,15 @@ define void @v_shuffle_v3i32_v2i32__3_3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -403,14 +399,14 @@ define void @v_shuffle_v3i32_v2i32__3_3_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -458,11 +454,11 @@ define void @v_shuffle_v3i32_v2i32__3_3_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -503,13 +499,13 @@ define void @v_shuffle_v3i32_v2i32__3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -561,26 +557,25 @@ define void @v_shuffle_v3i32_v2i32__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v2i32__u_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v2i32__u_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx3 v0, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -638,13 +633,13 @@ define void @v_shuffle_v3i32_v2i32__1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v2i32__1_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -696,26 +691,25 @@ define void @v_shuffle_v3i32_v2i32__2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v2i32__2_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v2i32__2_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx3 v0, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -729,15 +723,14 @@ define void @v_shuffle_v3i32_v2i32__3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -784,15 +777,14 @@ define void @v_shuffle_v3i32_v2i32__3_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -836,16 +828,15 @@ define void @v_shuffle_v3i32_v2i32__3_1_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_1_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -891,16 +882,15 @@ define void @v_shuffle_v3i32_v2i32__3_2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_2_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1033,13 +1023,13 @@ define void @v_shuffle_v3i32_v2i32__1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v2i32__1_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1121,16 +1111,15 @@ define void @v_shuffle_v3i32_v2i32__3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v2 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1176,15 +1165,14 @@ define void @v_shuffle_v3i32_v2i32__3_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_u_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1230,15 +1218,14 @@ define void @v_shuffle_v3i32_v2i32__3_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_0_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:4] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1289,13 +1276,12 @@ define void @v_shuffle_v3i32_v2i32__3_2_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1393,12 +1379,11 @@ define void @v_shuffle_v3i32_v2i32__1_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v2i32__1_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1446,13 +1431,13 @@ define void @v_shuffle_v3i32_v2i32__3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1493,13 +1478,12 @@ define void @v_shuffle_v3i32_v2i32__3_u_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_u_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1539,30 +1523,29 @@ define void @v_shuffle_v3i32_v2i32__3_0_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_0_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] -; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -1570,16 +1553,16 @@ define void @v_shuffle_v3i32_v2i32__3_0_2(ptr addrspace(1) inreg %ptr) { ; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_0_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] -; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v5 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 ; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -1595,14 +1578,14 @@ define void @v_shuffle_v3i32_v2i32__3_1_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1746,15 +1729,14 @@ define void @v_shuffle_v3i32_v2i32__1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1844,12 +1826,12 @@ define void @v_shuffle_v3i32_v2i32__3_u_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_u_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1890,15 +1872,14 @@ define void @v_shuffle_v3i32_v2i32__3_0_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1947,15 +1928,14 @@ define void @v_shuffle_v3i32_v2i32__3_1_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2001,12 +1981,13 @@ define void @v_shuffle_v3i32_v2i32__3_2_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_2_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2370,7 +2351,29 @@ define void @s_shuffle_v3i32_v2i32__3_3_u() { } define void @s_shuffle_v3i32_v2i32__3_3_0() { -; GFX900-LABEL: s_shuffle_v3i32_v2i32__3_3_0: +; GFX9-LABEL: s_shuffle_v3i32_v2i32__3_3_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v2i32__3_3_1() { +; GFX900-LABEL: s_shuffle_v3i32_v2i32__3_3_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART @@ -2380,13 +2383,13 @@ define void @s_shuffle_v3i32_v2i32__3_3_0() { ; GFX900-NEXT: ; def s[4:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s9 -; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s10, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:10] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3i32_v2i32__3_3_0: +; GFX90A-LABEL: s_shuffle_v3i32_v2i32__3_3_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART @@ -2396,13 +2399,13 @@ define void @s_shuffle_v3i32_v2i32__3_3_0() { ; GFX90A-NEXT: ; def s[4:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s9 -; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s10, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3i32_v2i32__3_3_0: +; GFX942-LABEL: s_shuffle_v3i32_v2i32__3_3_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -2412,115 +2415,31 @@ define void @s_shuffle_v3i32_v2i32__3_3_0() { ; GFX942-NEXT: ; def s[0:1] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s9 -; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s10, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:10] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=s"() %vec1 = call <2 x i32> asm "; def $0", "=s"() - %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x i32> %shuf) - ret void -} - -define void @s_shuffle_v3i32_v2i32__3_3_1() { -; GFX900-LABEL: s_shuffle_v3i32_v2i32__3_3_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s9 -; GFX900-NEXT: s_mov_b32 s10, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i32_v2i32__3_3_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s9 -; GFX90A-NEXT: s_mov_b32 s10, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i32_v2i32__3_3_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:9] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s9 -; GFX942-NEXT: s_mov_b32 s10, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <2 x i32> asm "; def $0", "=s"() - %vec1 = call <2 x i32> asm "; def $0", "=s"() - %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x i32> %shuf) ret void } define void @s_shuffle_v3i32_v2i32__3_3_2() { -; GFX900-LABEL: s_shuffle_v3i32_v2i32__3_3_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s5 -; GFX900-NEXT: s_mov_b32 s9, s5 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i32_v2i32__3_3_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s5 -; GFX90A-NEXT: s_mov_b32 s9, s5 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i32_v2i32__3_3_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s1 -; GFX942-NEXT: s_mov_b32 s9, s1 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3i32_v2i32__3_3_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s11 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=s"() %vec1 = call <2 x i32> asm "; def $0", "=s"() %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> @@ -2549,44 +2468,17 @@ define void @s_shuffle_v3i32_v2i32__3_3_3() { } define void @s_shuffle_v3i32_v2i32__u_0_0() { -; GFX900-LABEL: s_shuffle_v3i32_v2i32__u_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s9, s4 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i32_v2i32__u_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s9, s4 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i32_v2i32__u_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s9, s0 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3i32_v2i32__u_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s9, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=s"() %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <3 x i32> call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x i32> %shuf) @@ -2613,47 +2505,18 @@ define void @s_shuffle_v3i32_v2i32__0_0_0() { } define void @s_shuffle_v3i32_v2i32__1_0_0() { -; GFX900-LABEL: s_shuffle_v3i32_v2i32__1_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s5 -; GFX900-NEXT: s_mov_b32 s9, s4 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i32_v2i32__1_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s5 -; GFX90A-NEXT: s_mov_b32 s9, s4 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i32_v2i32__1_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s1 -; GFX942-NEXT: s_mov_b32 s9, s0 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3i32_v2i32__1_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s11 +; GFX9-NEXT: s_mov_b32 s9, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=s"() %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <3 x i32> call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x i32> %shuf) @@ -2661,44 +2524,17 @@ define void @s_shuffle_v3i32_v2i32__1_0_0() { } define void @s_shuffle_v3i32_v2i32__2_0_0() { -; GFX900-LABEL: s_shuffle_v3i32_v2i32__2_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s9, s4 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i32_v2i32__2_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s9, s4 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i32_v2i32__2_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s9, s0 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3i32_v2i32__2_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s9, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=s"() %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <3 x i32> call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x i32> %shuf) @@ -2710,14 +2546,13 @@ define void @s_shuffle_v3i32_v2i32__3_0_0() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ; def s[10:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ; def s[4:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s7 -; GFX900-NEXT: s_mov_b32 s9, s4 -; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b32 s9, s10 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:10] ; GFX900-NEXT: ;;#ASMEND @@ -2727,14 +2562,13 @@ define void @s_shuffle_v3i32_v2i32__3_0_0() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ; def s[10:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ; def s[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s7 -; GFX90A-NEXT: s_mov_b32 s9, s4 -; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b32 s9, s10 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] ; GFX90A-NEXT: ;;#ASMEND @@ -2744,14 +2578,13 @@ define void @s_shuffle_v3i32_v2i32__3_0_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] +; GFX942-NEXT: ; def s[10:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[2:3] +; GFX942-NEXT: ; def s[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s3 -; GFX942-NEXT: s_mov_b32 s9, s0 -; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s8, s1 +; GFX942-NEXT: s_mov_b32 s9, s10 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:10] ; GFX942-NEXT: ;;#ASMEND @@ -2768,13 +2601,12 @@ define void @s_shuffle_v3i32_v2i32__3_u_0() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ; def s[10:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ; def s[4:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s7 -; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s8, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:10] ; GFX900-NEXT: ;;#ASMEND @@ -2784,13 +2616,12 @@ define void @s_shuffle_v3i32_v2i32__3_u_0() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ; def s[10:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ; def s[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s7 -; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s8, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] ; GFX90A-NEXT: ;;#ASMEND @@ -2800,13 +2631,12 @@ define void @s_shuffle_v3i32_v2i32__3_u_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] +; GFX942-NEXT: ; def s[10:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[2:3] +; GFX942-NEXT: ; def s[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s3 -; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s8, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:10] ; GFX942-NEXT: ;;#ASMEND @@ -2823,14 +2653,13 @@ define void @s_shuffle_v3i32_v2i32__3_1_0() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ; def s[10:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ; def s[4:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s7 -; GFX900-NEXT: s_mov_b32 s9, s5 -; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b32 s9, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:10] ; GFX900-NEXT: ;;#ASMEND @@ -2840,14 +2669,13 @@ define void @s_shuffle_v3i32_v2i32__3_1_0() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ; def s[10:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ; def s[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s7 -; GFX90A-NEXT: s_mov_b32 s9, s5 -; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b32 s9, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] ; GFX90A-NEXT: ;;#ASMEND @@ -2857,14 +2685,13 @@ define void @s_shuffle_v3i32_v2i32__3_1_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] +; GFX942-NEXT: ; def s[10:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[2:3] +; GFX942-NEXT: ; def s[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s3 -; GFX942-NEXT: s_mov_b32 s9, s1 -; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s8, s1 +; GFX942-NEXT: s_mov_b32 s9, s11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:10] ; GFX942-NEXT: ;;#ASMEND @@ -2881,14 +2708,13 @@ define void @s_shuffle_v3i32_v2i32__3_2_0() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ; def s[10:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ; def s[4:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s7 -; GFX900-NEXT: s_mov_b32 s9, s6 -; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b32 s9, s4 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:10] ; GFX900-NEXT: ;;#ASMEND @@ -2898,14 +2724,13 @@ define void @s_shuffle_v3i32_v2i32__3_2_0() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ; def s[10:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ; def s[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s7 -; GFX90A-NEXT: s_mov_b32 s9, s6 -; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b32 s9, s4 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] ; GFX90A-NEXT: ;;#ASMEND @@ -2915,14 +2740,13 @@ define void @s_shuffle_v3i32_v2i32__3_2_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] +; GFX942-NEXT: ; def s[10:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[2:3] +; GFX942-NEXT: ; def s[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s3 -; GFX942-NEXT: s_mov_b32 s9, s2 -; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s8, s1 +; GFX942-NEXT: s_mov_b32 s9, s0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:10] ; GFX942-NEXT: ;;#ASMEND @@ -3344,47 +3168,18 @@ define void @s_shuffle_v3i32_v2i32__2_2_2() { } define void @s_shuffle_v3i32_v2i32__3_2_2() { -; GFX900-LABEL: s_shuffle_v3i32_v2i32__3_2_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s5 -; GFX900-NEXT: s_mov_b32 s9, s4 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i32_v2i32__3_2_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s5 -; GFX90A-NEXT: s_mov_b32 s9, s4 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i32_v2i32__3_2_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s1 -; GFX942-NEXT: s_mov_b32 s9, s0 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3i32_v2i32__3_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s11 +; GFX9-NEXT: s_mov_b32 s9, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=s"() %vec1 = call <2 x i32> asm "; def $0", "=s"() %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> @@ -3393,44 +3188,17 @@ define void @s_shuffle_v3i32_v2i32__3_2_2() { } define void @s_shuffle_v3i32_v2i32__3_u_2() { -; GFX900-LABEL: s_shuffle_v3i32_v2i32__3_u_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s5 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i32_v2i32__3_u_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s5 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i32_v2i32__3_u_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s1 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3i32_v2i32__3_u_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=s"() %vec1 = call <2 x i32> asm "; def $0", "=s"() %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> @@ -3446,11 +3214,10 @@ define void @s_shuffle_v3i32_v2i32__3_0_2() { ; GFX900-NEXT: ; def s[4:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ; def s[10:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s8, s11 ; GFX900-NEXT: s_mov_b32 s9, s4 -; GFX900-NEXT: s_mov_b32 s10, s6 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:10] ; GFX900-NEXT: ;;#ASMEND @@ -3463,11 +3230,10 @@ define void @s_shuffle_v3i32_v2i32__3_0_2() { ; GFX90A-NEXT: ; def s[4:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ; def s[10:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s8, s11 ; GFX90A-NEXT: s_mov_b32 s9, s4 -; GFX90A-NEXT: s_mov_b32 s10, s6 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] ; GFX90A-NEXT: ;;#ASMEND @@ -3480,11 +3246,10 @@ define void @s_shuffle_v3i32_v2i32__3_0_2() { ; GFX942-NEXT: ; def s[0:1] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[2:3] +; GFX942-NEXT: ; def s[10:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s3 +; GFX942-NEXT: s_mov_b32 s8, s11 ; GFX942-NEXT: s_mov_b32 s9, s0 -; GFX942-NEXT: s_mov_b32 s10, s2 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:10] ; GFX942-NEXT: ;;#ASMEND @@ -3497,53 +3262,20 @@ define void @s_shuffle_v3i32_v2i32__3_0_2() { } define void @s_shuffle_v3i32_v2i32__3_1_2() { -; GFX900-LABEL: s_shuffle_v3i32_v2i32__3_1_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s5 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i32_v2i32__3_1_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s5 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i32_v2i32__3_1_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:9] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s1 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3i32_v2i32__3_1_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=s"() %vec1 = call <2 x i32> asm "; def $0", "=s"() %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v3i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v3i32.ll index 7061c13b28d03..89e6a2918a68c 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v3i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v3i32.ll @@ -58,12 +58,11 @@ define void @v_shuffle_v3i32_v3i32__1_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__1_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -153,12 +152,11 @@ define void @v_shuffle_v3i32_v3i32__4_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__4_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -240,15 +238,14 @@ define void @v_shuffle_v3i32_v3i32__5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -355,9 +352,8 @@ define void @v_shuffle_v3i32_v3i32__5_2_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -403,13 +399,12 @@ define void @v_shuffle_v3i32_v3i32__5_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -495,9 +490,8 @@ define void @v_shuffle_v3i32_v3i32__5_5_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 ; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -540,14 +534,12 @@ define void @v_shuffle_v3i32_v3i32__5_5_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -597,16 +589,14 @@ define void @v_shuffle_v3i32_v3i32__5_5_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -656,15 +646,14 @@ define void @v_shuffle_v3i32_v3i32__5_5_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -760,14 +749,13 @@ define void @v_shuffle_v3i32_v3i32__5_5_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_5_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -813,10 +801,10 @@ define void @v_shuffle_v3i32_v3i32__5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -857,12 +845,11 @@ define void @v_shuffle_v3i32_v3i32__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -994,13 +981,12 @@ define void @v_shuffle_v3i32_v3i32__2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1041,12 +1027,11 @@ define void @v_shuffle_v3i32_v3i32__3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1088,14 +1073,12 @@ define void @v_shuffle_v3i32_v3i32__4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1145,16 +1128,15 @@ define void @v_shuffle_v3i32_v3i32__5_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1376,16 +1358,15 @@ define void @v_shuffle_v3i32_v3i32__5_3_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1574,10 +1555,10 @@ define void @v_shuffle_v3i32_v3i32__1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1704,15 +1685,14 @@ define void @v_shuffle_v3i32_v3i32__4_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1816,15 +1796,14 @@ define void @v_shuffle_v3i32_v3i32__5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1872,16 +1851,14 @@ define void @v_shuffle_v3i32_v3i32__5_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1931,16 +1908,15 @@ define void @v_shuffle_v3i32_v3i32__5_2_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1989,16 +1965,15 @@ define void @v_shuffle_v3i32_v3i32__5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2100,12 +2075,12 @@ define void @v_shuffle_v3i32_v3i32__u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__u_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2184,13 +2159,12 @@ define void @v_shuffle_v3i32_v3i32__1_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__1_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2232,10 +2206,10 @@ define void @v_shuffle_v3i32_v3i32__2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2274,12 +2248,12 @@ define void @v_shuffle_v3i32_v3i32__3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__3_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2317,15 +2291,14 @@ define void @v_shuffle_v3i32_v3i32__4_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2374,13 +2347,13 @@ define void @v_shuffle_v3i32_v3i32__5_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2480,16 +2453,15 @@ define void @v_shuffle_v3i32_v3i32__5_0_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2591,15 +2563,15 @@ define void @v_shuffle_v3i32_v3i32__5_3_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2751,12 +2723,11 @@ define void @v_shuffle_v3i32_v3i32__1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__1_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2895,13 +2866,12 @@ define void @v_shuffle_v3i32_v3i32__5_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2988,16 +2958,14 @@ define void @v_shuffle_v3i32_v3i32__5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3103,10 +3071,8 @@ define void @v_shuffle_v3i32_v3i32__5_2_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3306,9 +3272,8 @@ define void @v_shuffle_v3i32_v3i32__1_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v4, v3 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3456,10 +3421,10 @@ define void @v_shuffle_v3i32_v3i32__4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3545,13 +3510,12 @@ define void @v_shuffle_v3i32_v3i32__5_u_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_u_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3592,15 +3556,15 @@ define void @v_shuffle_v3i32_v3i32__5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3705,14 +3669,13 @@ define void @v_shuffle_v3i32_v3i32__5_2_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3760,13 +3723,12 @@ define void @v_shuffle_v3i32_v3i32__5_3_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_3_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3807,12 +3769,12 @@ define void @v_shuffle_v3i32_v3i32__u_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__u_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3913,9 +3875,9 @@ define void @v_shuffle_v3i32_v3i32__1_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 ; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3963,15 +3925,15 @@ define void @v_shuffle_v3i32_v3i32__2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: v_mov_b32_e32 v4, v5 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4060,13 +4022,12 @@ define void @v_shuffle_v3i32_v3i32__4_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__4_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4150,15 +4111,15 @@ define void @v_shuffle_v3i32_v3i32__5_0_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4262,15 +4223,15 @@ define void @v_shuffle_v3i32_v3i32__5_2_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 ; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4316,14 +4277,13 @@ define void @v_shuffle_v3i32_v3i32__5_3_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_3_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v2p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v2p3.ll index bd0100a4ffdb5..25e087bd922ac 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v2p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v2p3.ll @@ -58,12 +58,11 @@ define void @v_shuffle_v3p3_v2p3__1_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__1_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -111,12 +110,11 @@ define void @v_shuffle_v3p3_v2p3__3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -154,15 +152,14 @@ define void @v_shuffle_v3p3_v2p3__3_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_0_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -208,15 +205,14 @@ define void @v_shuffle_v3p3_v2p3__3_1_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_1_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -263,10 +259,10 @@ define void @v_shuffle_v3p3_v2p3__3_2_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -307,12 +303,12 @@ define void @v_shuffle_v3p3_v2p3__3_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -350,15 +346,15 @@ define void @v_shuffle_v3p3_v2p3__3_3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -403,14 +399,14 @@ define void @v_shuffle_v3p3_v2p3__3_3_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -458,11 +454,11 @@ define void @v_shuffle_v3p3_v2p3__3_3_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -503,13 +499,13 @@ define void @v_shuffle_v3p3_v2p3__3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -561,26 +557,25 @@ define void @v_shuffle_v3p3_v2p3__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v2p3__u_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v2p3__u_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx3 v0, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -638,13 +633,13 @@ define void @v_shuffle_v3p3_v2p3__1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__1_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -696,26 +691,25 @@ define void @v_shuffle_v3p3_v2p3__2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v2p3__2_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v2p3__2_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx3 v0, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -729,15 +723,14 @@ define void @v_shuffle_v3p3_v2p3__3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -784,15 +777,14 @@ define void @v_shuffle_v3p3_v2p3__3_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -836,16 +828,15 @@ define void @v_shuffle_v3p3_v2p3__3_1_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_1_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -891,16 +882,15 @@ define void @v_shuffle_v3p3_v2p3__3_2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_2_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1033,13 +1023,13 @@ define void @v_shuffle_v3p3_v2p3__1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__1_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1121,16 +1111,15 @@ define void @v_shuffle_v3p3_v2p3__3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v2 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1176,15 +1165,14 @@ define void @v_shuffle_v3p3_v2p3__3_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_u_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1230,15 +1218,14 @@ define void @v_shuffle_v3p3_v2p3__3_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_0_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:4] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1289,13 +1276,12 @@ define void @v_shuffle_v3p3_v2p3__3_2_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1393,12 +1379,11 @@ define void @v_shuffle_v3p3_v2p3__1_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__1_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1446,13 +1431,13 @@ define void @v_shuffle_v3p3_v2p3__3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1493,13 +1478,12 @@ define void @v_shuffle_v3p3_v2p3__3_u_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_u_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1539,30 +1523,29 @@ define void @v_shuffle_v3p3_v2p3__3_0_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_0_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] -; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -1570,16 +1553,16 @@ define void @v_shuffle_v3p3_v2p3__3_0_2(ptr addrspace(1) inreg %ptr) { ; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_0_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] -; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v5 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 ; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -1595,14 +1578,14 @@ define void @v_shuffle_v3p3_v2p3__3_1_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1746,15 +1729,14 @@ define void @v_shuffle_v3p3_v2p3__1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1844,12 +1826,12 @@ define void @v_shuffle_v3p3_v2p3__3_u_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_u_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1890,15 +1872,14 @@ define void @v_shuffle_v3p3_v2p3__3_0_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1947,15 +1928,14 @@ define void @v_shuffle_v3p3_v2p3__3_1_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2001,12 +1981,13 @@ define void @v_shuffle_v3p3_v2p3__3_2_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_2_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2370,7 +2351,29 @@ define void @s_shuffle_v3p3_v2p3__3_3_u() { } define void @s_shuffle_v3p3_v2p3__3_3_0() { -; GFX900-LABEL: s_shuffle_v3p3_v2p3__3_3_0: +; GFX9-LABEL: s_shuffle_v3p3_v2p3__3_3_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v2p3__3_3_1() { +; GFX900-LABEL: s_shuffle_v3p3_v2p3__3_3_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART @@ -2380,13 +2383,13 @@ define void @s_shuffle_v3p3_v2p3__3_3_0() { ; GFX900-NEXT: ; def s[4:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s9 -; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s10, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:10] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3p3_v2p3__3_3_0: +; GFX90A-LABEL: s_shuffle_v3p3_v2p3__3_3_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART @@ -2396,13 +2399,13 @@ define void @s_shuffle_v3p3_v2p3__3_3_0() { ; GFX90A-NEXT: ; def s[4:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s9 -; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s10, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3p3_v2p3__3_3_0: +; GFX942-LABEL: s_shuffle_v3p3_v2p3__3_3_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -2412,115 +2415,31 @@ define void @s_shuffle_v3p3_v2p3__3_3_0() { ; GFX942-NEXT: ; def s[0:1] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s9 -; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s10, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:10] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() - %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x ptr addrspace(3)> %shuf) - ret void -} - -define void @s_shuffle_v3p3_v2p3__3_3_1() { -; GFX900-LABEL: s_shuffle_v3p3_v2p3__3_3_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s9 -; GFX900-NEXT: s_mov_b32 s10, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p3_v2p3__3_3_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s9 -; GFX90A-NEXT: s_mov_b32 s10, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p3_v2p3__3_3_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:9] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s9 -; GFX942-NEXT: s_mov_b32 s10, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() - %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() - %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x ptr addrspace(3)> %shuf) ret void } define void @s_shuffle_v3p3_v2p3__3_3_2() { -; GFX900-LABEL: s_shuffle_v3p3_v2p3__3_3_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s5 -; GFX900-NEXT: s_mov_b32 s9, s5 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p3_v2p3__3_3_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s5 -; GFX90A-NEXT: s_mov_b32 s9, s5 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p3_v2p3__3_3_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s1 -; GFX942-NEXT: s_mov_b32 s9, s1 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3p3_v2p3__3_3_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s11 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> @@ -2549,44 +2468,17 @@ define void @s_shuffle_v3p3_v2p3__3_3_3() { } define void @s_shuffle_v3p3_v2p3__u_0_0() { -; GFX900-LABEL: s_shuffle_v3p3_v2p3__u_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s9, s4 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p3_v2p3__u_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s9, s4 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p3_v2p3__u_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s9, s0 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3p3_v2p3__u_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s9, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <3 x i32> call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x ptr addrspace(3)> %shuf) @@ -2613,47 +2505,18 @@ define void @s_shuffle_v3p3_v2p3__0_0_0() { } define void @s_shuffle_v3p3_v2p3__1_0_0() { -; GFX900-LABEL: s_shuffle_v3p3_v2p3__1_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s5 -; GFX900-NEXT: s_mov_b32 s9, s4 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p3_v2p3__1_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s5 -; GFX90A-NEXT: s_mov_b32 s9, s4 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p3_v2p3__1_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s1 -; GFX942-NEXT: s_mov_b32 s9, s0 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3p3_v2p3__1_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s11 +; GFX9-NEXT: s_mov_b32 s9, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <3 x i32> call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x ptr addrspace(3)> %shuf) @@ -2661,44 +2524,17 @@ define void @s_shuffle_v3p3_v2p3__1_0_0() { } define void @s_shuffle_v3p3_v2p3__2_0_0() { -; GFX900-LABEL: s_shuffle_v3p3_v2p3__2_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s9, s4 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p3_v2p3__2_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s9, s4 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p3_v2p3__2_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s9, s0 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3p3_v2p3__2_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s9, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <3 x i32> call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x ptr addrspace(3)> %shuf) @@ -2710,14 +2546,13 @@ define void @s_shuffle_v3p3_v2p3__3_0_0() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ; def s[10:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ; def s[4:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s7 -; GFX900-NEXT: s_mov_b32 s9, s4 -; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b32 s9, s10 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:10] ; GFX900-NEXT: ;;#ASMEND @@ -2727,14 +2562,13 @@ define void @s_shuffle_v3p3_v2p3__3_0_0() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ; def s[10:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ; def s[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s7 -; GFX90A-NEXT: s_mov_b32 s9, s4 -; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b32 s9, s10 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] ; GFX90A-NEXT: ;;#ASMEND @@ -2744,14 +2578,13 @@ define void @s_shuffle_v3p3_v2p3__3_0_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] +; GFX942-NEXT: ; def s[10:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[2:3] +; GFX942-NEXT: ; def s[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s3 -; GFX942-NEXT: s_mov_b32 s9, s0 -; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s8, s1 +; GFX942-NEXT: s_mov_b32 s9, s10 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:10] ; GFX942-NEXT: ;;#ASMEND @@ -2768,13 +2601,12 @@ define void @s_shuffle_v3p3_v2p3__3_u_0() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ; def s[10:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ; def s[4:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s7 -; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s8, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:10] ; GFX900-NEXT: ;;#ASMEND @@ -2784,13 +2616,12 @@ define void @s_shuffle_v3p3_v2p3__3_u_0() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ; def s[10:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ; def s[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s7 -; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s8, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] ; GFX90A-NEXT: ;;#ASMEND @@ -2800,13 +2631,12 @@ define void @s_shuffle_v3p3_v2p3__3_u_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] +; GFX942-NEXT: ; def s[10:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[2:3] +; GFX942-NEXT: ; def s[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s3 -; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s8, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:10] ; GFX942-NEXT: ;;#ASMEND @@ -2823,14 +2653,13 @@ define void @s_shuffle_v3p3_v2p3__3_1_0() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ; def s[10:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ; def s[4:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s7 -; GFX900-NEXT: s_mov_b32 s9, s5 -; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b32 s9, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:10] ; GFX900-NEXT: ;;#ASMEND @@ -2840,14 +2669,13 @@ define void @s_shuffle_v3p3_v2p3__3_1_0() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ; def s[10:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ; def s[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s7 -; GFX90A-NEXT: s_mov_b32 s9, s5 -; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b32 s9, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] ; GFX90A-NEXT: ;;#ASMEND @@ -2857,14 +2685,13 @@ define void @s_shuffle_v3p3_v2p3__3_1_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] +; GFX942-NEXT: ; def s[10:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[2:3] +; GFX942-NEXT: ; def s[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s3 -; GFX942-NEXT: s_mov_b32 s9, s1 -; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s8, s1 +; GFX942-NEXT: s_mov_b32 s9, s11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:10] ; GFX942-NEXT: ;;#ASMEND @@ -2881,14 +2708,13 @@ define void @s_shuffle_v3p3_v2p3__3_2_0() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ; def s[10:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ; def s[4:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s7 -; GFX900-NEXT: s_mov_b32 s9, s6 -; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b32 s9, s4 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:10] ; GFX900-NEXT: ;;#ASMEND @@ -2898,14 +2724,13 @@ define void @s_shuffle_v3p3_v2p3__3_2_0() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ; def s[10:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ; def s[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s7 -; GFX90A-NEXT: s_mov_b32 s9, s6 -; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b32 s9, s4 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] ; GFX90A-NEXT: ;;#ASMEND @@ -2915,14 +2740,13 @@ define void @s_shuffle_v3p3_v2p3__3_2_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] +; GFX942-NEXT: ; def s[10:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[2:3] +; GFX942-NEXT: ; def s[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s3 -; GFX942-NEXT: s_mov_b32 s9, s2 -; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s8, s1 +; GFX942-NEXT: s_mov_b32 s9, s0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:10] ; GFX942-NEXT: ;;#ASMEND @@ -3344,47 +3168,18 @@ define void @s_shuffle_v3p3_v2p3__2_2_2() { } define void @s_shuffle_v3p3_v2p3__3_2_2() { -; GFX900-LABEL: s_shuffle_v3p3_v2p3__3_2_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s5 -; GFX900-NEXT: s_mov_b32 s9, s4 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p3_v2p3__3_2_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s5 -; GFX90A-NEXT: s_mov_b32 s9, s4 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p3_v2p3__3_2_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s1 -; GFX942-NEXT: s_mov_b32 s9, s0 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3p3_v2p3__3_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s11 +; GFX9-NEXT: s_mov_b32 s9, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> @@ -3393,44 +3188,17 @@ define void @s_shuffle_v3p3_v2p3__3_2_2() { } define void @s_shuffle_v3p3_v2p3__3_u_2() { -; GFX900-LABEL: s_shuffle_v3p3_v2p3__3_u_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s5 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p3_v2p3__3_u_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s5 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p3_v2p3__3_u_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s1 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3p3_v2p3__3_u_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> @@ -3446,11 +3214,10 @@ define void @s_shuffle_v3p3_v2p3__3_0_2() { ; GFX900-NEXT: ; def s[4:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ; def s[10:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s8, s11 ; GFX900-NEXT: s_mov_b32 s9, s4 -; GFX900-NEXT: s_mov_b32 s10, s6 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:10] ; GFX900-NEXT: ;;#ASMEND @@ -3463,11 +3230,10 @@ define void @s_shuffle_v3p3_v2p3__3_0_2() { ; GFX90A-NEXT: ; def s[4:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ; def s[10:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s8, s11 ; GFX90A-NEXT: s_mov_b32 s9, s4 -; GFX90A-NEXT: s_mov_b32 s10, s6 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] ; GFX90A-NEXT: ;;#ASMEND @@ -3480,11 +3246,10 @@ define void @s_shuffle_v3p3_v2p3__3_0_2() { ; GFX942-NEXT: ; def s[0:1] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[2:3] +; GFX942-NEXT: ; def s[10:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s3 +; GFX942-NEXT: s_mov_b32 s8, s11 ; GFX942-NEXT: s_mov_b32 s9, s0 -; GFX942-NEXT: s_mov_b32 s10, s2 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:10] ; GFX942-NEXT: ;;#ASMEND @@ -3497,53 +3262,20 @@ define void @s_shuffle_v3p3_v2p3__3_0_2() { } define void @s_shuffle_v3p3_v2p3__3_1_2() { -; GFX900-LABEL: s_shuffle_v3p3_v2p3__3_1_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s5 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p3_v2p3__3_1_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s5 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p3_v2p3__3_1_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:9] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s1 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3p3_v2p3__3_1_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v3p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v3p3.ll index cecd2a0e4b015..62b9da9fedb95 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v3p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v3p3.ll @@ -58,12 +58,11 @@ define void @v_shuffle_v3p3_v3p3__1_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__1_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -153,12 +152,11 @@ define void @v_shuffle_v3p3_v3p3__4_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__4_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -240,15 +238,14 @@ define void @v_shuffle_v3p3_v3p3__5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -355,9 +352,8 @@ define void @v_shuffle_v3p3_v3p3__5_2_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -403,13 +399,12 @@ define void @v_shuffle_v3p3_v3p3__5_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -495,9 +490,8 @@ define void @v_shuffle_v3p3_v3p3__5_5_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 ; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -540,14 +534,12 @@ define void @v_shuffle_v3p3_v3p3__5_5_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -597,16 +589,14 @@ define void @v_shuffle_v3p3_v3p3__5_5_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -656,15 +646,14 @@ define void @v_shuffle_v3p3_v3p3__5_5_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -760,14 +749,13 @@ define void @v_shuffle_v3p3_v3p3__5_5_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_5_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -813,10 +801,10 @@ define void @v_shuffle_v3p3_v3p3__5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -857,12 +845,11 @@ define void @v_shuffle_v3p3_v3p3__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -994,13 +981,12 @@ define void @v_shuffle_v3p3_v3p3__2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1041,12 +1027,11 @@ define void @v_shuffle_v3p3_v3p3__3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1088,14 +1073,12 @@ define void @v_shuffle_v3p3_v3p3__4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1145,16 +1128,15 @@ define void @v_shuffle_v3p3_v3p3__5_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1376,16 +1358,15 @@ define void @v_shuffle_v3p3_v3p3__5_3_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1574,10 +1555,10 @@ define void @v_shuffle_v3p3_v3p3__1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1704,15 +1685,14 @@ define void @v_shuffle_v3p3_v3p3__4_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1816,15 +1796,14 @@ define void @v_shuffle_v3p3_v3p3__5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1872,16 +1851,14 @@ define void @v_shuffle_v3p3_v3p3__5_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1931,16 +1908,15 @@ define void @v_shuffle_v3p3_v3p3__5_2_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1989,16 +1965,15 @@ define void @v_shuffle_v3p3_v3p3__5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2100,12 +2075,12 @@ define void @v_shuffle_v3p3_v3p3__u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__u_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2184,13 +2159,12 @@ define void @v_shuffle_v3p3_v3p3__1_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__1_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2232,10 +2206,10 @@ define void @v_shuffle_v3p3_v3p3__2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2274,12 +2248,12 @@ define void @v_shuffle_v3p3_v3p3__3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__3_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2317,15 +2291,14 @@ define void @v_shuffle_v3p3_v3p3__4_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2374,13 +2347,13 @@ define void @v_shuffle_v3p3_v3p3__5_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2480,16 +2453,15 @@ define void @v_shuffle_v3p3_v3p3__5_0_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2591,15 +2563,15 @@ define void @v_shuffle_v3p3_v3p3__5_3_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2751,12 +2723,11 @@ define void @v_shuffle_v3p3_v3p3__1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__1_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2895,13 +2866,12 @@ define void @v_shuffle_v3p3_v3p3__5_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2988,16 +2958,14 @@ define void @v_shuffle_v3p3_v3p3__5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3103,10 +3071,8 @@ define void @v_shuffle_v3p3_v3p3__5_2_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3306,9 +3272,8 @@ define void @v_shuffle_v3p3_v3p3__1_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v4, v3 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3456,10 +3421,10 @@ define void @v_shuffle_v3p3_v3p3__4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3545,13 +3510,12 @@ define void @v_shuffle_v3p3_v3p3__5_u_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_u_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3592,15 +3556,15 @@ define void @v_shuffle_v3p3_v3p3__5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3705,14 +3669,13 @@ define void @v_shuffle_v3p3_v3p3__5_2_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3760,13 +3723,12 @@ define void @v_shuffle_v3p3_v3p3__5_3_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_3_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3807,12 +3769,12 @@ define void @v_shuffle_v3p3_v3p3__u_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__u_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3913,9 +3875,9 @@ define void @v_shuffle_v3p3_v3p3__1_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 ; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3963,15 +3925,15 @@ define void @v_shuffle_v3p3_v3p3__2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: v_mov_b32_e32 v4, v5 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4060,13 +4022,12 @@ define void @v_shuffle_v3p3_v3p3__4_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__4_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4150,15 +4111,15 @@ define void @v_shuffle_v3p3_v3p3__5_0_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4262,15 +4223,15 @@ define void @v_shuffle_v3p3_v3p3__5_2_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 ; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4316,14 +4277,13 @@ define void @v_shuffle_v3p3_v3p3__5_3_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_3_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4bf16.v3bf16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4bf16.v3bf16.ll index fa422e48bbce0..89ce868b03546 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4bf16.v3bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4bf16.v3bf16.ll @@ -103,12 +103,11 @@ define void @v_shuffle_v4bf16_v3bf16__2_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__2_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -203,12 +202,11 @@ define void @v_shuffle_v4bf16_v3bf16__5_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -567,16 +565,15 @@ define void @v_shuffle_v4bf16_v3bf16__5_5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -744,14 +741,13 @@ define void @v_shuffle_v4bf16_v3bf16__5_5_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1606,16 +1602,15 @@ define void @v_shuffle_v4bf16_v3bf16__5_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v2, v2, v2, s4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2639,16 +2634,15 @@ define void @v_shuffle_v4bf16_v3bf16__5_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v2, v2, v2, s4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s4, 0x7060302 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3017,16 +3011,15 @@ define void @v_shuffle_v4bf16_v3bf16__5_5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3077,16 +3070,15 @@ define void @v_shuffle_v4bf16_v3bf16__5_5_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3697,16 +3689,15 @@ define void @v_shuffle_v4bf16_v3bf16__5_u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v2, v2, s4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4459,12 +4450,11 @@ define void @v_shuffle_v4bf16_v3bf16__2_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__2_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5982,14 +5972,13 @@ define void @v_shuffle_v4bf16_v3bf16__5_5_u_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_u_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6223,14 +6212,13 @@ define void @v_shuffle_v4bf16_v3bf16__5_5_3_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4bf16.v4bf16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4bf16.v4bf16.ll index ab297c02fe3b5..8e24d6e02f3ff 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4bf16.v4bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4bf16.v4bf16.ll @@ -100,12 +100,11 @@ define void @v_shuffle_v4bf16_v4bf16__2_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__2_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -238,12 +237,11 @@ define void @v_shuffle_v4bf16_v4bf16__6_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__6_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -721,16 +719,15 @@ define void @v_shuffle_v4bf16_v4bf16__7_7_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -950,14 +947,13 @@ define void @v_shuffle_v4bf16_v4bf16__7_7_4_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_4_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3870,16 +3866,15 @@ define void @v_shuffle_v4bf16_v4bf16__7_7_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3928,16 +3923,15 @@ define void @v_shuffle_v4bf16_v4bf16__7_7_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7088,12 +7082,11 @@ define void @v_shuffle_v4bf16_v4bf16__2_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__2_4_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9087,14 +9080,13 @@ define void @v_shuffle_v4bf16_v4bf16__7_7_u_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_u_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9377,14 +9369,13 @@ define void @v_shuffle_v4bf16_v4bf16__7_7_4_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_4_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f16.v3f16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f16.v3f16.ll index e91433ac4c1f7..d1ff8c658c77d 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f16.v3f16.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f16.v3f16.ll @@ -103,12 +103,11 @@ define void @v_shuffle_v4f16_v3f16__2_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f16_v3f16__2_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -203,12 +202,11 @@ define void @v_shuffle_v4f16_v3f16__5_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -567,16 +565,15 @@ define void @v_shuffle_v4f16_v3f16__5_5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -744,14 +741,13 @@ define void @v_shuffle_v4f16_v3f16__5_5_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_5_3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1606,16 +1602,15 @@ define void @v_shuffle_v4f16_v3f16__5_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v2, v2, v2, s4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2639,16 +2634,15 @@ define void @v_shuffle_v4f16_v3f16__5_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v2, v2, v2, s4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s4, 0x7060302 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3017,16 +3011,15 @@ define void @v_shuffle_v4f16_v3f16__5_5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3077,16 +3070,15 @@ define void @v_shuffle_v4f16_v3f16__5_5_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3697,16 +3689,15 @@ define void @v_shuffle_v4f16_v3f16__5_u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v2, v2, s4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4459,12 +4450,11 @@ define void @v_shuffle_v4f16_v3f16__2_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f16_v3f16__2_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5982,14 +5972,13 @@ define void @v_shuffle_v4f16_v3f16__5_5_u_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_5_u_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6223,14 +6212,13 @@ define void @v_shuffle_v4f16_v3f16__5_5_3_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_5_3_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f16.v4f16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f16.v4f16.ll index 47100b9983559..8a9a0d1a7ef5d 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f16.v4f16.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f16.v4f16.ll @@ -100,12 +100,11 @@ define void @v_shuffle_v4f16_v4f16__2_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f16_v4f16__2_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -238,12 +237,11 @@ define void @v_shuffle_v4f16_v4f16__6_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f16_v4f16__6_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -721,16 +719,15 @@ define void @v_shuffle_v4f16_v4f16__7_7_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -950,14 +947,13 @@ define void @v_shuffle_v4f16_v4f16__7_7_4_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_4_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3870,16 +3866,15 @@ define void @v_shuffle_v4f16_v4f16__7_7_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3928,16 +3923,15 @@ define void @v_shuffle_v4f16_v4f16__7_7_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7088,12 +7082,11 @@ define void @v_shuffle_v4f16_v4f16__2_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f16_v4f16__2_4_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9087,14 +9080,13 @@ define void @v_shuffle_v4f16_v4f16__7_7_u_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_u_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9377,14 +9369,13 @@ define void @v_shuffle_v4f16_v4f16__7_7_4_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_4_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v3f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v3f32.ll index 7c8417837f788..5828e40595f9f 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v3f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v3f32.ll @@ -3985,12 +3985,11 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] @@ -4004,11 +4003,10 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v7, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v6 ; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] @@ -4105,12 +4103,11 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] @@ -4124,12 +4121,11 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v7, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v6 ; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] @@ -6709,16 +6705,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[2:3], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6726,17 +6722,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[6:7], v[2:3], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -6871,28 +6867,29 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_4_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_4_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i16.v3i16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i16.v3i16.ll index 7b3a5a879f44f..1a7e281e7e138 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i16.v3i16.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i16.v3i16.ll @@ -103,12 +103,11 @@ define void @v_shuffle_v4i16_v3i16__2_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i16_v3i16__2_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -203,12 +202,11 @@ define void @v_shuffle_v4i16_v3i16__5_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -567,16 +565,15 @@ define void @v_shuffle_v4i16_v3i16__5_5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -744,14 +741,13 @@ define void @v_shuffle_v4i16_v3i16__5_5_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_5_3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1606,16 +1602,15 @@ define void @v_shuffle_v4i16_v3i16__5_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v2, v2, v2, s4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2639,16 +2634,15 @@ define void @v_shuffle_v4i16_v3i16__5_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v2, v2, v2, s4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s4, 0x7060302 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3017,16 +3011,15 @@ define void @v_shuffle_v4i16_v3i16__5_5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3077,16 +3070,15 @@ define void @v_shuffle_v4i16_v3i16__5_5_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3697,16 +3689,15 @@ define void @v_shuffle_v4i16_v3i16__5_u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v2, v2, s4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4459,12 +4450,11 @@ define void @v_shuffle_v4i16_v3i16__2_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i16_v3i16__2_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5982,14 +5972,13 @@ define void @v_shuffle_v4i16_v3i16__5_5_u_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_5_u_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6223,14 +6212,13 @@ define void @v_shuffle_v4i16_v3i16__5_5_3_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_5_3_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i16.v4i16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i16.v4i16.ll index 2a371b7c7d2d3..05ebf49b997eb 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i16.v4i16.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i16.v4i16.ll @@ -100,12 +100,11 @@ define void @v_shuffle_v4i16_v4i16__2_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i16_v4i16__2_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -238,12 +237,11 @@ define void @v_shuffle_v4i16_v4i16__6_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i16_v4i16__6_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -721,16 +719,15 @@ define void @v_shuffle_v4i16_v4i16__7_7_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -950,14 +947,13 @@ define void @v_shuffle_v4i16_v4i16__7_7_4_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_4_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3870,16 +3866,15 @@ define void @v_shuffle_v4i16_v4i16__7_7_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3928,16 +3923,15 @@ define void @v_shuffle_v4i16_v4i16__7_7_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7088,12 +7082,11 @@ define void @v_shuffle_v4i16_v4i16__2_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i16_v4i16__2_4_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9087,14 +9080,13 @@ define void @v_shuffle_v4i16_v4i16__7_7_u_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_u_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9377,14 +9369,13 @@ define void @v_shuffle_v4i16_v4i16__7_7_4_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_4_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v3i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v3i32.ll index f7149350e74d3..3a659e1753e97 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v3i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v3i32.ll @@ -3985,12 +3985,11 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] @@ -4004,11 +4003,10 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v7, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v6 ; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] @@ -4105,12 +4103,11 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] @@ -4124,12 +4121,11 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v7, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v6 ; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] @@ -6709,16 +6705,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[2:3], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6726,17 +6722,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[6:7], v[2:3], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -6871,28 +6867,29 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_4_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_4_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll index aa9e23b971823..f1c1e4b20f242 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll @@ -3985,12 +3985,11 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] @@ -4004,11 +4003,10 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v7, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v6 ; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] @@ -4105,12 +4103,11 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] @@ -4124,12 +4121,11 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v7, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v6 ; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] @@ -6709,16 +6705,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[2:3], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6726,17 +6722,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[6:7], v[2:3], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -6871,28 +6867,29 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_4_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_4_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll index 1b4ed67eb6eea..94448411cfd0e 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll @@ -2625,12 +2625,11 @@ define i64 @test_vector_reduce_mul_v3i64(<3 x i64> %v) { ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v0, v2, 0 ; GFX7-GISEL-NEXT: v_mov_b32_e32 v6, v1 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v8 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v3, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v0, v3, v[8:9] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v4, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v2, v[8:9] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v6, v2, v[9:10] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v7, v5, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v10, v4, v[8:9] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v11, v4, v[8:9] ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_vector_reduce_mul_v3i64: @@ -2653,12 +2652,11 @@ define i64 @test_vector_reduce_mul_v3i64(<3 x i64> %v) { ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v0, v2, 0 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v6, v1 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v8 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v3, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v0, v3, v[8:9] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v4, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v2, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v6, v2, v[9:10] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v7, v5, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v10, v4, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v11, v4, v[8:9] ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: test_vector_reduce_mul_v3i64: @@ -2703,13 +2701,12 @@ define i64 @test_vector_reduce_mul_v3i64(<3 x i64> %v) { ; GFX10-GISEL-LABEL: test_vector_reduce_mul_v3i64: ; GFX10-GISEL: ; %bb.0: ; %entry ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v7, v1 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[8:9], s4, v6, v2, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], s4, v6, v3, v[9:10] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v8, v4, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v7, v2, v[9:10] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], s4, v8, v5, v[1:2] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], s4, v0, v2, 0 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v6, v1 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[8:9], s4, v0, v3, v[8:9] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v7, v4, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v6, v2, v[8:9] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], s4, v7, v5, v[1:2] ; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v2, v4, v[5:6] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -2731,16 +2728,16 @@ define i64 @test_vector_reduce_mul_v3i64(<3 x i64> %v) { ; GFX11-GISEL-LABEL: test_vector_reduce_mul_v3i64: ; GFX11-GISEL: ; %bb.0: ; %entry ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v6, v0 :: v_dual_mov_b32 v7, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v6, v2, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v8, v4, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v6, v3, v[9:10] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[11:12], null, v7, v2, v[10:11] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v8, v5, v[1:2] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v11, v4, v[6:7] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v0, v2, 0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v6, v1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v0, v3, v[8:9] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v7, v4, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v6, v2, v[9:10] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v7, v5, v[1:2] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v10, v4, v[8:9] ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-LABEL: test_vector_reduce_mul_v3i64: @@ -2810,18 +2807,16 @@ define i64 @test_vector_reduce_mul_v4i64(<4 x i64> %v) { ; GFX7-GISEL-LABEL: test_vector_reduce_mul_v4i64: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v0, v4, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v2, v6, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v2, v6, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v0, v4, 0 ; GFX7-GISEL-NEXT: v_mov_b32_e32 v8, v1 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v10 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v0, v5, v[1:2] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v2, v7, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v9, v11, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v3, v6, v[15:16] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v4, v[13:14] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v17, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v11, v[3:4] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v2, v7, v[10:11] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v0, v5, v[12:13] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v3, v6, v[13:14] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v11, v9, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v4, v[15:16] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v17, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v9, v[3:4] ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_vector_reduce_mul_v4i64: @@ -2847,18 +2842,16 @@ define i64 @test_vector_reduce_mul_v4i64(<4 x i64> %v) { ; GFX8-GISEL-LABEL: test_vector_reduce_mul_v4i64: ; GFX8-GISEL: ; %bb.0: ; %entry ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v0, v4, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v2, v6, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v2, v6, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v0, v4, 0 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v8, v1 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v10 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v0, v5, v[1:2] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v2, v7, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v9, v11, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v3, v6, v[15:16] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v4, v[13:14] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v17, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v11, v[3:4] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v2, v7, v[10:11] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v0, v5, v[12:13] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v3, v6, v[13:14] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v11, v9, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v4, v[15:16] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v17, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v9, v[3:4] ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: test_vector_reduce_mul_v4i64: @@ -2915,19 +2908,16 @@ define i64 @test_vector_reduce_mul_v4i64(<4 x i64> %v) { ; GFX10-GISEL-LABEL: test_vector_reduce_mul_v4i64: ; GFX10-GISEL: ; %bb.0: ; %entry ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[10:11], s4, v2, v6, 0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v8, v0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v9, v1 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[12:13], s4, v8, v4, 0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, v11 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[14:15], s4, v2, v7, v[0:1] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, v13 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v12, v10, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], s4, v8, v5, v[2:3] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v3, v6, v[14:15] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v9, v4, v[7:8] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v12, v2, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v3, v10, v[1:2] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], s4, v2, v6, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[11:12], s4, v0, v4, 0 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v8, v1 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v2, v7, v[10:11] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[12:13], s4, v0, v5, v[12:13] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v3, v6, v[1:2] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v11, v9, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v8, v4, v[12:13] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v11, v2, v[1:2] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v3, v9, v[1:2] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: test_vector_reduce_mul_v4i64: @@ -2953,23 +2943,19 @@ define i64 @test_vector_reduce_mul_v4i64(<4 x i64> %v) { ; GFX11-GISEL-LABEL: test_vector_reduce_mul_v4i64: ; GFX11-GISEL: ; %bb.0: ; %entry ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v8, v0 :: v_dual_mov_b32 v9, v1 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v2, v6, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[12:13], null, v8, v4, 0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, v11 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[14:15], null, v2, v7, v[0:1] -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, v13 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v12, v10, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[15:16], null, v8, v5, v[2:3] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v3, v6, v[14:15] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v9, v4, v[15:16] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v2, v6, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[11:12], null, v0, v4, 0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v8, v1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[13:14], null, v2, v7, v[10:11] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[14:15], null, v0, v5, v[12:13] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v11, v9, 0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v12, v7, v[1:2] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v5, v10, v[3:4] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[15:16], null, v3, v6, v[13:14] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v8, v4, v[14:15] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v11, v15, v[1:2] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v5, v9, v[3:4] ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-LABEL: test_vector_reduce_mul_v4i64: @@ -3070,29 +3056,26 @@ define i64 @test_vector_reduce_mul_v8i64(<8 x i64> %v) { ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v4, v12, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v0, v8, 0 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v4, v13, v[17:18] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v5, v12, v[18:19] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v20, v16, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v20, v22, v[5:6] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v5, v21 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v6, v14, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v5, v12, v[18:19] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v8, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v16, 0 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v0, v9, v[5:6] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v20 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v1, v8, v[17:18] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v15, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v10, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v11, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v14, v[8:9] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v0, v19, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v3, v10, v[5:6] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v2, v8 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v0, v1, v[2:3] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v7, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v19, v[5:6] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v21, v16, v[12:13] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v2, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v7, v[8:9] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v4, v20, v[13:14] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v14, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v6, v15, v[5:6] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v10, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v7, v14, v[19:20] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v2, v11, v[6:7] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v5, v4, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v5, v23, v[7:8] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v3, v10, v[13:14] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, v8, v[17:18] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v23, v4, v[19:20] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v6, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v16, v[21:22] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v7, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v6, v[4:5] ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_vector_reduce_mul_v8i64: @@ -3139,29 +3122,26 @@ define i64 @test_vector_reduce_mul_v8i64(<8 x i64> %v) { ; GFX8-GISEL: ; %bb.0: ; %entry ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v4, v12, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v0, v8, 0 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v4, v13, v[17:18] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v5, v12, v[18:19] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v20, v16, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v20, v22, v[5:6] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, v21 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v6, v14, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v5, v12, v[18:19] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v8, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v16, 0 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v0, v9, v[5:6] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v20 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v1, v8, v[17:18] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v15, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v10, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v11, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v14, v[8:9] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v0, v19, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v3, v10, v[5:6] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, v8 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v0, v1, v[2:3] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v7, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v19, v[5:6] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v21, v16, v[12:13] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v2, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v7, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v4, v20, v[13:14] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v14, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v6, v15, v[5:6] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v10, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v7, v14, v[19:20] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v2, v11, v[6:7] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v5, v4, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v5, v23, v[7:8] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v3, v10, v[13:14] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, v8, v[17:18] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v23, v4, v[19:20] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v6, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v16, v[21:22] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v7, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v6, v[4:5] ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: test_vector_reduce_mul_v8i64: @@ -3266,34 +3246,27 @@ define i64 @test_vector_reduce_mul_v8i64(<8 x i64> %v) { ; GFX10-GISEL-LABEL: test_vector_reduce_mul_v8i64: ; GFX10-GISEL: ; %bb.0: ; %entry ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[17:18], s4, v0, v8, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[19:20], s4, v2, v10, 0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v16, v1 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[21:22], s4, v6, v14, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[23:24], s4, v4, v12, 0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v18 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v18, v20 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[25:26], s4, v0, v9, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v2, v11, v[18:19] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, v22 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, v24 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[29:30], s4, v4, v13, v[2:3] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[26:27], s4, v6, v15, v[0:1] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[27:28], s4, v19, v21, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[30:31], s4, v17, v23, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v3, v10, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[6:7], s4, v7, v14, v[26:27] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, v28 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v5, v12, v[29:30] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, v31 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], s4, v19, v6, v[0:1] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[6:7], s4, v16, v8, v[25:26] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v30, v27, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v17, v3, v[4:5] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], s4, v2, v21, v[5:6] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v6, v23, v[3:4] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v30, v4, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v2, v27, v[3:4] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[16:17], s4, v6, v14, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[18:19], s4, v0, v8, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[20:21], s4, v2, v10, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[22:23], s4, v4, v12, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[24:25], s4, v6, v15, v[17:18] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[25:26], s4, v0, v9, v[19:20] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[26:27], s4, v2, v11, v[21:22] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[27:28], s4, v4, v13, v[23:24] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[13:14], s4, v7, v14, v[24:25] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[6:7], s4, v20, v16, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], s4, v3, v10, v[26:27] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], s4, v5, v12, v[27:28] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v18, v22, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[10:11], s4, v20, v13, v[7:8] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], s4, v1, v8, v[25:26] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v18, v4, v[3:4] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], s4, v9, v16, v[10:11] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v2, v6, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], s4, v7, v22, v[3:4] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v2, v4, v[1:2] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v7, v6, v[1:2] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: test_vector_reduce_mul_v8i64: @@ -3337,39 +3310,34 @@ define i64 @test_vector_reduce_mul_v8i64(<8 x i64> %v) { ; GFX11-GISEL-LABEL: test_vector_reduce_mul_v8i64: ; GFX11-GISEL: ; %bb.0: ; %entry ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[17:18], null, v0, v8, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[19:20], null, v2, v10, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[21:22], null, v6, v14, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[23:24], null, v4, v12, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v16, v1 :: v_dual_mov_b32 v1, v18 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v18, v20 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[25:26], null, v0, v9, v[1:2] -; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, v22 :: v_dual_mov_b32 v1, v24 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[26:27], null, v2, v11, v[18:19] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[27:28], null, v6, v15, v[0:1] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[30:31], null, v4, v13, v[1:2] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[28:29], null, v19, v21, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[31:32], null, v17, v23, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v7, v14, v[27:28] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v3, v10, v[26:27] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v5, v12, v[30:31] -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, v29 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, v32 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v16, v8, v[25:26] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v19, v0, v[1:2] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v31, v28, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v17, v2, v[3:4] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v6, v21, v[4:5] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v9, v23, v[7:8] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v31, v2, v[1:2] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v3, v28, v[4:5] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[16:17], null, v6, v14, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[18:19], null, v0, v8, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[20:21], null, v2, v10, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[22:23], null, v4, v12, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[24:25], null, v6, v15, v[17:18] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[25:26], null, v0, v9, v[19:20] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[26:27], null, v2, v11, v[21:22] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[27:28], null, v4, v13, v[23:24] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[28:29], null, v7, v14, v[24:25] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v20, v16, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[13:14], null, v3, v10, v[26:27] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v5, v12, v[27:28] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v18, v22, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v20, v28, v[7:8] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v1, v8, v[25:26] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v2, v6, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v18, v9, v[3:4] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v13, v16, v[4:5] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v10, v22, v[7:8] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v2, v8, v[1:2] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v3, v6, v[4:5] ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-LABEL: test_vector_reduce_mul_v8i64: @@ -3551,60 +3519,49 @@ define i64 @test_vector_reduce_mul_v16i64(<16 x i64> %v) { ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[33:34], s[4:5], v8, v25, v[32:33] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[35:36], s[4:5], v9, v24, v[33:34] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v16, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v0, v17, v[9:10] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v8, v31, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v8, v35, v[25:26] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v8, v9 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[34:35], s[4:5], v0, v17, v[8:9] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v16, v[34:35] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[36:37], s[4:5], v8, v35, v[25:26] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v16, v[32:33] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v26, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v8, v31, v[32:33] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v8, v31, v[36:37] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v10, v27, v[1:2] -; GFX7-GISEL-NEXT: buffer_load_dword v27, off, s[0:3], s32 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[31:32], s[4:5], v2, v18, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[33:34], s[4:5], v11, v26, v[8:9] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v31, v0, 0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v9 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v31, v33, v[1:2] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v32 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[25:26], s[4:5], v2, v19, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v18, v[25:26] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[31:32], s[4:5], v11, v26, v[8:9] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v2, v18, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v2, v19, v[9:10] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v8, v0, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[25:26], s[4:5], v8, v31, v[2:3] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v18, v[10:11] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v28, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v1, v0, v[9:10] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v20, 0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v12, v29, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v9, v2, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v28, v[18:19] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v3, v10 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v21, v[3:4] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v8, v0, v[25:26] +; GFX7-GISEL-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v12, v29, v[3:4] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v20, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v13, v28, v[10:11] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v8, v2, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[25:26], s[4:5], v4, v21, v[9:10] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v14, v30, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v9, v0, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v20, v[12:13] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v4 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v22, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v0, v2, v[18:19] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v3, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[27:28], s[4:5], v8, v18, v[11:12] ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v27, v[1:2] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v2, v5 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v15, v30, v[9:10] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v23, v[2:3] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v0, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v22, v[9:10] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v24, v11, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v0, v3, v[5:6] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v12, 0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v10 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v24, v20, v[0:1] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v4 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v8, v1, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v9, v3, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v17, v12, v[13:14] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v11, v[5:6] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v7, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v3, v[4:5] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v0, v[4:5] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v15, v30, v[8:9] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v22, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v23, v[9:10] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v8, v3, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v8, v13, v[12:13] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v5, v20, v[25:26] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v22, v[14:15] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v8, v2, v[27:28] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v3, v[18:19] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v11, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v24, v10, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v7, v[4:5] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v24, v12, v[6:7] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v17, v11, v[8:9] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v3, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v16, v10, v[13:14] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v5, v6, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v3, v[8:9] ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_vector_reduce_mul_v16i64: @@ -3696,60 +3653,49 @@ define i64 @test_vector_reduce_mul_v16i64(<16 x i64> %v) { ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[33:34], s[4:5], v8, v25, v[32:33] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[35:36], s[4:5], v9, v24, v[33:34] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v16, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v0, v17, v[9:10] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v8, v31, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v8, v35, v[25:26] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v8, v9 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[34:35], s[4:5], v0, v17, v[8:9] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v16, v[34:35] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[36:37], s[4:5], v8, v35, v[25:26] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v16, v[32:33] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v26, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v8, v31, v[32:33] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v8, v31, v[36:37] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v10, v27, v[1:2] -; GFX8-GISEL-NEXT: buffer_load_dword v27, off, s[0:3], s32 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[31:32], s[4:5], v2, v18, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[33:34], s[4:5], v11, v26, v[8:9] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v31, v0, 0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v9 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v31, v33, v[1:2] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v32 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[25:26], s[4:5], v2, v19, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v18, v[25:26] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[31:32], s[4:5], v11, v26, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v2, v18, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v2, v19, v[9:10] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v8, v0, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[25:26], s[4:5], v8, v31, v[2:3] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v18, v[10:11] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v28, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v1, v0, v[9:10] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v20, 0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v12, v29, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v9, v2, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v28, v[18:19] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v21, v[3:4] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v8, v0, v[25:26] +; GFX8-GISEL-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v12, v29, v[3:4] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v20, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v13, v28, v[10:11] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v8, v2, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[25:26], s[4:5], v4, v21, v[9:10] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v14, v30, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v9, v0, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v20, v[12:13] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v22, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v0, v2, v[18:19] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v3, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[27:28], s[4:5], v8, v18, v[11:12] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v27, v[1:2] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, v5 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v15, v30, v[9:10] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v23, v[2:3] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v0, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v22, v[9:10] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v24, v11, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v0, v3, v[5:6] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v12, 0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v10 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v24, v20, v[0:1] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v8, v1, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v9, v3, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v17, v12, v[13:14] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v11, v[5:6] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v7, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v3, v[4:5] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v0, v[4:5] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v15, v30, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v22, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v23, v[9:10] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v8, v3, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v8, v13, v[12:13] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v5, v20, v[25:26] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v22, v[14:15] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v8, v2, v[27:28] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v3, v[18:19] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v11, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v24, v10, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v7, v[4:5] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v24, v12, v[6:7] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v17, v11, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v3, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v16, v10, v[13:14] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v5, v6, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v3, v[8:9] ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: test_vector_reduce_mul_v16i64: @@ -3956,66 +3902,53 @@ define i64 @test_vector_reduce_mul_v16i64(<16 x i64> %v) { ; GFX10-GISEL-LABEL: test_vector_reduce_mul_v16i64: ; GFX10-GISEL: ; %bb.0: ; %entry ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[31:32], s4, v0, v16, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[33:34], s4, v2, v18, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[38:39], s4, v6, v22, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[35:36], s4, v0, v17, v[32:33] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[36:37], s4, v4, v20, 0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, v34 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[16:17], s4, v1, v16, v[35:36] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v2, v19, v[0:1] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v37 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, v39 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[34:35], s4, v8, v24, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[48:49], s4, v4, v21, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[49:50], s4, v10, v26, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v6, v23, v[2:3] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, v35 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], s4, v5, v20, v[48:49] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[51:52], s4, v8, v25, v[2:3] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, v50 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[52:53], s4, v10, v27, v[2:3] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[53:54], s4, v12, v28, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], s4, v9, v24, v[51:52] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, v54 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[10:11], s4, v11, v26, v[52:53] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[19:20], s4, v36, v53, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[54:55], s4, v12, v29, v[2:3] -; GFX10-GISEL-NEXT: buffer_load_dword v12, off, s[0:3], s32 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v3, v18, v[0:1] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v14, v30, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[17:18], s4, v33, v49, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[6:7], s4, v7, v22, v[1:2] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], s4, v31, v34, 0 +; GFX10-GISEL-NEXT: buffer_load_dword v35, off, s[0:3], s32 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[31:32], s4, v14, v30, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[33:34], s4, v6, v22, 0 ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v14, v12, v[0:1] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[11:12], s4, v38, v3, 0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v8 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[13:14], s4, v13, v28, v[54:55] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[14:15], s4, v15, v30, v[0:1] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, v18 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[8:9], s4, v31, v9, v[1:2] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], s4, v33, v10, v[0:1] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, v20 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[14:15], s4, v38, v14, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[20:21], s4, v17, v11, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v36, v13, v[0:1] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[12:13], s4, v7, v19, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v6, v3, v[14:15] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v21 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], s4, v2, v49, v[9:10] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, v13 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], s4, v5, v53, v[0:1] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], s4, v17, v3, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v12, v20, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v7, v4, v[2:3] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[13:14], s4, v16, v34, v[8:9] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v9, v11, v[5:6] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], s4, v13, v19, v[2:3] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v12, v3, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v4, v20, v[1:2] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[36:37], s4, v14, v35, v[32:33] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[37:38], s4, v6, v23, v[34:35] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[34:35], s4, v33, v31, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[38:39], s4, v15, v30, v[36:37] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[14:15], s4, v0, v16, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[36:37], s4, v7, v22, v[37:38] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[6:7], s4, v2, v18, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[22:23], s4, v4, v20, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[32:33], s4, v33, v38, v[35:36] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[37:38], s4, v0, v17, v[15:16] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[38:39], s4, v2, v19, v[7:8] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[35:36], s4, v36, v31, v[32:33] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[30:31], s4, v8, v24, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[32:33], s4, v10, v26, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[15:16], s4, v1, v16, v[37:38] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[16:17], s4, v3, v18, v[38:39] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v12, v28, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v4, v21, v[23:24] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v8, v25, v[31:32] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], s4, v10, v27, v[33:34] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[17:18], s4, v12, v29, v[1:2] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[18:19], s4, v5, v20, v[2:3] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[19:20], s4, v9, v24, v[3:4] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], s4, v11, v26, v[7:8] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v6, v32, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[10:11], s4, v13, v28, v[17:18] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], s4, v22, v0, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v14, v30, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[11:12], s4, v6, v9, v[4:5] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[12:13], s4, v22, v10, v[8:9] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[8:9], s4, v3, v34, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[13:14], s4, v14, v19, v[2:3] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], s4, v1, v7, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[10:11], s4, v16, v32, v[11:12] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[11:12], s4, v18, v0, v[12:13] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[12:13], s4, v15, v30, v[13:14] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v3, v35, v[9:10] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], s4, v1, v11, v[5:6] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v4, v8, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v10, v34, v[2:3] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], s4, v12, v7, v[5:6] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v4, v2, v[1:2] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v5, v8, v[1:2] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: test_vector_reduce_mul_v16i64: @@ -4096,66 +4029,62 @@ define i64 @test_vector_reduce_mul_v16i64(<16 x i64> %v) { ; GFX11-GISEL-LABEL: test_vector_reduce_mul_v16i64: ; GFX11-GISEL: ; %bb.0: ; %entry ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: scratch_load_b32 v71, off, s32 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[31:32], null, v0, v16, 0 +; GFX11-GISEL-NEXT: scratch_load_b32 v55, off, s32 ; GFX11-GISEL-NEXT: v_mad_u64_u32 v[33:34], null, v2, v18, 0 ; GFX11-GISEL-NEXT: v_mad_u64_u32 v[35:36], null, v4, v20, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[31:32], null, v0, v16, 0 ; GFX11-GISEL-NEXT: v_mad_u64_u32 v[37:38], null, v6, v22, 0 ; GFX11-GISEL-NEXT: v_mad_u64_u32 v[50:51], null, v10, v26, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[82:83], null, v2, v19, v[34:35] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[70:71], null, v0, v17, v[32:33] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[83:84], null, v4, v21, v[36:37] ; GFX11-GISEL-NEXT: v_mad_u64_u32 v[52:53], null, v12, v28, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[64:65], null, v14, v30, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[66:67], null, v33, v50, 0 ; GFX11-GISEL-NEXT: v_mad_u64_u32 v[48:49], null, v8, v24, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[54:55], null, v14, v30, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[82:83], null, v0, v17, v[32:33] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[83:84], null, v2, v19, v[34:35] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[84:85], null, v4, v21, v[36:37] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[85:86], null, v6, v23, v[38:39] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[86:87], null, v10, v27, v[51:52] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[65:66], null, v31, v48, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[84:85], null, v6, v23, v[38:39] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[96:97], null, v1, v16, v[70:71] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v3, v18, v[82:83] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[85:86], null, v10, v27, v[51:52] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[86:87], null, v12, v29, v[53:54] ; GFX11-GISEL-NEXT: v_mad_u64_u32 v[38:39], null, v8, v25, v[49:50] -; GFX11-GISEL-NEXT: v_mov_b32_e32 v64, v55 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[96:97], null, v12, v29, v[53:54] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[97:98], null, v1, v16, v[82:83] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v3, v18, v[83:84] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v5, v20, v[84:85] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v7, v22, v[85:86] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[67:68], null, v33, v50, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[80:81], null, v37, v54, 0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, v66 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[69:70], null, v35, v52, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[80:81], null, v37, v64, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[16:17], null, v5, v20, v[83:84] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[68:69], null, v35, v52, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[17:18], null, v7, v22, v[84:85] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v9, v24, v[38:39] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v13, v28, v[86:87] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[53:54], null, v31, v48, 0 ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v14, v71, v[64:65] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v9, v24, v[38:39] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v11, v26, v[86:87] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v13, v28, v[96:97] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v15, v30, v[4:5] -; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, v68 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v31, v5, v[0:1] -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, v81 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v33, v6, v[4:5] -; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, v70 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v14, v55, v[65:66] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v15, v30, v[1:2] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v11, v26, v[85:86] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v53, v68, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v37, v4, v[81:82] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v33, v1, v[67:68] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v35, v7, v[69:70] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v66, v80, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[11:12], null, v17, v64, v[8:9] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v31, v6, v[54:55] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[12:13], null, v0, v50, v[9:10] ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v37, v8, v[0:1] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[11:12], null, v67, v80, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[13:14], null, v35, v7, v[4:5] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v65, v69, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v66, v11, v[5:6] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v16, v52, v[10:11] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v96, v48, v[7:8] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v53, v0, v[3:4] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v12, v80, v[8:9] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v2, v4, 0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[14:15], null, v3, v54, v[5:6] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v1, v50, v[10:11] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v2, v52, v[13:14] -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, v7 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v67, v14, v[0:1] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[12:13], null, v97, v48, v[9:10] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v6, v11, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v65, v4, v[2:3] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v3, v80, v[7:8] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v12, v69, v[8:9] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v6, v4, v[1:2] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v9, v11, v[7:8] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v5, v68, v[6:7] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v2, v9, v[1:2] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v7, v4, v[5:6] ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-LABEL: test_vector_reduce_mul_v16i64: